1 /*
2  * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org>
3  * Copyright © 2018 Google, Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  *
24  * Authors:
25  *    Rob Clark <robclark@freedesktop.org>
26  */
27 
28 #include "pipe/p_state.h"
29 #include "util/format/u_format.h"
30 #include "util/u_helpers.h"
31 #include "util/u_memory.h"
32 #include "util/u_string.h"
33 #include "util/u_viewport.h"
34 
35 #include "common/freedreno_guardband.h"
36 #include "freedreno_query_hw.h"
37 #include "freedreno_resource.h"
38 #include "freedreno_state.h"
39 #include "freedreno_tracepoints.h"
40 
41 #include "fd6_blend.h"
42 #include "fd6_const.h"
43 #include "fd6_context.h"
44 #include "fd6_emit.h"
45 #include "fd6_format.h"
46 #include "fd6_image.h"
47 #include "fd6_pack.h"
48 #include "fd6_program.h"
49 #include "fd6_rasterizer.h"
50 #include "fd6_texture.h"
51 #include "fd6_zsa.h"
52 
53 /* Border color layout is diff from a4xx/a5xx.. if it turns out to be
54  * the same on later gens then move this somewhere common ;-)
55  *
56  * Entry layout looks like (total size 0x60 bytes):
57  */
58 
59 struct PACKED bcolor_entry {
60    uint32_t fp32[4];
61    uint16_t ui16[4];
62    int16_t si16[4];
63    uint16_t fp16[4];
64    uint16_t rgb565;
65    uint16_t rgb5a1;
66    uint16_t rgba4;
67    uint8_t __pad0[2];
68    uint8_t ui8[4];
69    int8_t si8[4];
70    uint32_t rgb10a2;
71    uint32_t z24; /* also s8? */
72    uint16_t
73       srgb[4]; /* appears to duplicate fp16[], but clamped, used for srgb */
74    uint8_t __pad1[56];
75 };
76 
77 #define FD6_BORDER_COLOR_SIZE sizeof(struct bcolor_entry)
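/* Border colors for both the VS and FS sampler sets are uploaded into one
 * buffer, back to back (see emit_border_color()):
 */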
78 #define FD6_BORDER_COLOR_UPLOAD_SIZE                                           \
79    (2 * PIPE_MAX_SAMPLERS * FD6_BORDER_COLOR_SIZE)
80 
81 static void
82 setup_border_colors(struct fd_texture_stateobj *tex,
83                     struct bcolor_entry *entries)
84 {
85    unsigned i, j;
86    STATIC_ASSERT(sizeof(struct bcolor_entry) == FD6_BORDER_COLOR_SIZE);
87 
88    for (i = 0; i < tex->num_samplers; i++) {
89       struct bcolor_entry *e = &entries[i];
90       struct pipe_sampler_state *sampler = tex->samplers[i];
91       union pipe_color_union *bc;
92 
93       if (!sampler)
94          continue;
95 
96       bc = &sampler->border_color;
97 
98       /*
99        * XXX HACK ALERT XXX
100        *
101        * The border colors need to be swizzled in a particular
102        * format-dependent order. Even though samplers don't know about
103        * formats, we can assume that with a GL state tracker, there's a
104        * 1:1 correspondence between sampler and texture. Take advantage
105        * of that knowledge.
106        */
107       if ((i >= tex->num_textures) || !tex->textures[i])
108          continue;
109 
110       struct pipe_sampler_view *view = tex->textures[i];
111       enum pipe_format format = view->format;
112       const struct util_format_description *desc =
113          util_format_description(format);
114       const struct fd_resource *rsc = fd_resource(view->texture);
115 
116       e->rgb565 = 0;
117       e->rgb5a1 = 0;
118       e->rgba4 = 0;
119       e->rgb10a2 = 0;
120       e->z24 = 0;
121 
122       unsigned char swiz[4];
123 
124       fd6_tex_swiz(format, rsc->layout.tile_mode, swiz, view->swizzle_r, view->swizzle_g,
125                    view->swizzle_b, view->swizzle_a);
126 
127       for (j = 0; j < 4; j++) {
128          int c = swiz[j];
129          int cd = c;
130 
131          /*
132           * HACK: for PIPE_FORMAT_X24S8_UINT we end up w/ the
133           * stencil border color value in bc->ui[0] but according
134           * to desc->swizzle and desc->channel, the .x/.w component
135           * is NONE and the stencil value is in the y component.
136           * Meanwhile the hardware wants this in the .w component
137           * for x24s8 and the .x component for x32_s8x24.
138           */
139          if ((format == PIPE_FORMAT_X24S8_UINT) ||
140              (format == PIPE_FORMAT_X32_S8X24_UINT)) {
141             if (j == 0) {
142                c = 1;
143                cd = (format == PIPE_FORMAT_X32_S8X24_UINT) ? 0 : 3;
144             } else {
145                continue;
146             }
147          }
148 
149          if (c >= 4)
150             continue;
151 
152          if (desc->channel[c].pure_integer) {
153             uint16_t clamped;
154             switch (desc->channel[c].size) {
155             case 2:
156                assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED);
157                clamped = CLAMP(bc->ui[j], 0, 0x3);
158                break;
159             case 8:
160                if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED)
161                   clamped = CLAMP(bc->i[j], -128, 127);
162                else
163                   clamped = CLAMP(bc->ui[j], 0, 255);
164                break;
165             case 10:
166                assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED);
167                clamped = CLAMP(bc->ui[j], 0, 0x3ff);
168                break;
169             case 16:
170                if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED)
171                   clamped = CLAMP(bc->i[j], -32768, 32767);
172                else
173                   clamped = CLAMP(bc->ui[j], 0, 65535);
174                break;
175             default:
176                assert(!"Unexpected bit size");
177             case 32:
178                clamped = 0;
179                break;
180             }
181             e->fp32[cd] = bc->ui[j];
182             e->fp16[cd] = clamped;
183          } else {
184             float f = bc->f[j];
185             float f_u = CLAMP(f, 0, 1);
186             float f_s = CLAMP(f, -1, 1);
187 
188             e->fp32[c] = fui(f);
189             e->fp16[c] = _mesa_float_to_half(f);
190             e->srgb[c] = _mesa_float_to_half(f_u);
191             e->ui16[c] = f_u * 0xffff;
192             e->si16[c] = f_s * 0x7fff;
193             e->ui8[c] = f_u * 0xff;
194             e->si8[c] = f_s * 0x7f;
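            /* Pack this (swizzled) channel into the small packed formats;
             * c selects the component slot within each packed value:
             */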
195             if (c == 1)
196                e->rgb565 |= (int)(f_u * 0x3f) << 5;
197             else if (c < 3)
198                e->rgb565 |= (int)(f_u * 0x1f) << (c ? 11 : 0);
199             if (c == 3)
200                e->rgb5a1 |= (f_u > 0.5) ? 0x8000 : 0;
201             else
202                e->rgb5a1 |= (int)(f_u * 0x1f) << (c * 5);
203             if (c == 3)
204                e->rgb10a2 |= (int)(f_u * 0x3) << 30;
205             else
206                e->rgb10a2 |= (int)(f_u * 0x3ff) << (c * 10);
207             e->rgba4 |= (int)(f_u * 0xf) << (c * 4);
208             if (c == 0)
209                e->z24 = f_u * 0xffffff;
210          }
211       }
212 
213 #ifdef DEBUG
214       memset(&e->__pad0, 0, sizeof(e->__pad0));
215       memset(&e->__pad1, 0, sizeof(e->__pad1));
216 #endif
217    }
218 }
219 
220 static void
221 emit_border_color(struct fd_context *ctx, struct fd_ringbuffer *ring) assert_dt
222 {
223    struct fd6_context *fd6_ctx = fd6_context(ctx);
224    struct bcolor_entry *entries;
225    unsigned off;
226    void *ptr;
227 
228    STATIC_ASSERT(sizeof(struct bcolor_entry) == FD6_BORDER_COLOR_SIZE);
229 
230    u_upload_alloc(fd6_ctx->border_color_uploader, 0,
231                   FD6_BORDER_COLOR_UPLOAD_SIZE, FD6_BORDER_COLOR_UPLOAD_SIZE,
232                   &off, &fd6_ctx->border_color_buf, &ptr);
233 
234    entries = ptr;
235 
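   /* VS sampler border colors are uploaded first, FS sampler border colors
    * immediately follow:
    */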
236    setup_border_colors(&ctx->tex[PIPE_SHADER_VERTEX], &entries[0]);
237    setup_border_colors(&ctx->tex[PIPE_SHADER_FRAGMENT],
238                        &entries[ctx->tex[PIPE_SHADER_VERTEX].num_samplers]);
239 
240    OUT_PKT4(ring, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR, 2);
241    OUT_RELOC(ring, fd_resource(fd6_ctx->border_color_buf)->bo, off, 0, 0);
242 
243    u_upload_unmap(fd6_ctx->border_color_uploader);
244 }
245 
246 static void
247 fd6_emit_fb_tex(struct fd_ringbuffer *state, struct fd_context *ctx) assert_dt
248 {
249    struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
250    struct pipe_surface *psurf = pfb->cbufs[0];
251    struct fd_resource *rsc = fd_resource(psurf->texture);
252 
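   /* Emit a full 16-dword texture descriptor for framebuffer read; the
    * address/layout related fields are left as placeholders to be patched
    * once the gmem layout is known:
    */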
253    OUT_RINGP(state, 0, &ctx->batch->fb_read_patches); /* texconst0, patched in gmem emit */
254    OUT_RING(state, A6XX_TEX_CONST_1_WIDTH(pfb->width) |
255                       A6XX_TEX_CONST_1_HEIGHT(pfb->height));
256    OUT_RING(state, 0); /* texconst2, patched in gmem emit */
257    OUT_RING(state, A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size));
258    OUT_RING(state, 0); /* BASE_LO, patched in gmem emit */
259    OUT_RING(state, 0); /* BASE_HI, patched in gmem emit */
260    OUT_RING(state, 0); /* texconst6 */
261    OUT_RING(state, 0); /* texconst7 */
262    OUT_RING(state, 0); /* texconst8 */
263    OUT_RING(state, 0); /* texconst9 */
264    OUT_RING(state, 0); /* texconst10 */
265    OUT_RING(state, 0); /* texconst11 */
266    OUT_RING(state, 0);
267    OUT_RING(state, 0);
268    OUT_RING(state, 0);
269    OUT_RING(state, 0);
270 }
271 
272 bool
273 fd6_emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring,
274                   enum pipe_shader_type type, struct fd_texture_stateobj *tex,
275                   unsigned bcolor_offset,
276                   /* can be NULL if no image/SSBO/fb state to merge in: */
277                   const struct ir3_shader_variant *v)
278 {
279    bool needs_border = false;
280    unsigned opcode, tex_samp_reg, tex_const_reg, tex_count_reg;
281    enum a6xx_state_block sb;
282 
283    switch (type) {
284    case PIPE_SHADER_VERTEX:
285       sb = SB6_VS_TEX;
286       opcode = CP_LOAD_STATE6_GEOM;
287       tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP;
288       tex_const_reg = REG_A6XX_SP_VS_TEX_CONST;
289       tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT;
290       break;
291    case PIPE_SHADER_TESS_CTRL:
292       sb = SB6_HS_TEX;
293       opcode = CP_LOAD_STATE6_GEOM;
294       tex_samp_reg = REG_A6XX_SP_HS_TEX_SAMP;
295       tex_const_reg = REG_A6XX_SP_HS_TEX_CONST;
296       tex_count_reg = REG_A6XX_SP_HS_TEX_COUNT;
297       break;
298    case PIPE_SHADER_TESS_EVAL:
299       sb = SB6_DS_TEX;
300       opcode = CP_LOAD_STATE6_GEOM;
301       tex_samp_reg = REG_A6XX_SP_DS_TEX_SAMP;
302       tex_const_reg = REG_A6XX_SP_DS_TEX_CONST;
303       tex_count_reg = REG_A6XX_SP_DS_TEX_COUNT;
304       break;
305    case PIPE_SHADER_GEOMETRY:
306       sb = SB6_GS_TEX;
307       opcode = CP_LOAD_STATE6_GEOM;
308       tex_samp_reg = REG_A6XX_SP_GS_TEX_SAMP;
309       tex_const_reg = REG_A6XX_SP_GS_TEX_CONST;
310       tex_count_reg = REG_A6XX_SP_GS_TEX_COUNT;
311       break;
312    case PIPE_SHADER_FRAGMENT:
313       sb = SB6_FS_TEX;
314       opcode = CP_LOAD_STATE6_FRAG;
315       tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP;
316       tex_const_reg = REG_A6XX_SP_FS_TEX_CONST;
317       tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT;
318       break;
319    case PIPE_SHADER_COMPUTE:
320       sb = SB6_CS_TEX;
321       opcode = CP_LOAD_STATE6_FRAG;
322       tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP;
323       tex_const_reg = REG_A6XX_SP_CS_TEX_CONST;
324       tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT;
325       break;
326    default:
327       unreachable("bad state block");
328    }
329 
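   /* Sampler state is emitted first (4 dwords per sampler), followed by the
    * texture descriptors (16 dwords each), with any merged image/SSBO/fb-read
    * entries appended after the shader's own textures:
    */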
330    if (tex->num_samplers > 0) {
331       struct fd_ringbuffer *state =
332          fd_ringbuffer_new_object(ctx->pipe, tex->num_samplers * 4 * 4);
333       for (unsigned i = 0; i < tex->num_samplers; i++) {
334          static const struct fd6_sampler_stateobj dummy_sampler = {};
335          const struct fd6_sampler_stateobj *sampler =
336             tex->samplers[i] ? fd6_sampler_stateobj(tex->samplers[i])
337                              : &dummy_sampler;
338          OUT_RING(state, sampler->texsamp0);
339          OUT_RING(state, sampler->texsamp1);
340          OUT_RING(state, sampler->texsamp2 |
341                             A6XX_TEX_SAMP_2_BCOLOR(i + bcolor_offset));
342          OUT_RING(state, sampler->texsamp3);
343          needs_border |= sampler->needs_border;
344       }
345 
346       /* output sampler state: */
347       OUT_PKT7(ring, opcode, 3);
348       OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
349                         CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
350                         CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
351                         CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
352                         CP_LOAD_STATE6_0_NUM_UNIT(tex->num_samplers));
353       OUT_RB(ring, state); /* SRC_ADDR_LO/HI */
354 
355       OUT_PKT4(ring, tex_samp_reg, 2);
356       OUT_RB(ring, state); /* SRC_ADDR_LO/HI */
357 
358       fd_ringbuffer_del(state);
359    }
360 
361    unsigned num_merged_textures = tex->num_textures;
362    unsigned num_textures = tex->num_textures;
363    if (v) {
364       num_merged_textures += v->image_mapping.num_tex;
365 
366       if (v->fb_read)
367          num_merged_textures++;
368 
369       /* There could be more bound textures than what the shader uses,
370        * which isn't known at shader compile time.  So in the case we
371        * are merging tex state, only emit the textures that the shader
372        * uses (since the image/SSBO related tex state comes immediately
373        * after)
374        */
375       num_textures = v->image_mapping.tex_base;
376    }
377 
378    if (num_merged_textures > 0) {
379       struct fd_ringbuffer *state =
380          fd_ringbuffer_new_object(ctx->pipe, num_merged_textures * 16 * 4);
381       for (unsigned i = 0; i < num_textures; i++) {
382          const struct fd6_pipe_sampler_view *view;
383 
384          if (tex->textures[i]) {
385             view = fd6_pipe_sampler_view(tex->textures[i]);
386             if (unlikely(view->rsc_seqno !=
387                          fd_resource(view->base.texture)->seqno)) {
388                fd6_sampler_view_update(ctx,
389                                        fd6_pipe_sampler_view(tex->textures[i]));
390             }
391          } else {
392             static const struct fd6_pipe_sampler_view dummy_view = {};
393             view = &dummy_view;
394          }
395 
396          OUT_RING(state, view->texconst0);
397          OUT_RING(state, view->texconst1);
398          OUT_RING(state, view->texconst2);
399          OUT_RING(state, view->texconst3);
400 
401          if (view->ptr1) {
402             OUT_RELOC(state, view->ptr1->bo, view->offset1,
403                       (uint64_t)view->texconst5 << 32, 0);
404          } else {
405             OUT_RING(state, 0x00000000);
406             OUT_RING(state, view->texconst5);
407          }
408 
409          OUT_RING(state, view->texconst6);
410 
411          if (view->ptr2) {
412             OUT_RELOC(state, view->ptr2->bo, view->offset2, 0, 0);
413          } else {
414             OUT_RING(state, 0);
415             OUT_RING(state, 0);
416          }
417 
418          OUT_RING(state, view->texconst9);
419          OUT_RING(state, view->texconst10);
420          OUT_RING(state, view->texconst11);
421          OUT_RING(state, 0);
422          OUT_RING(state, 0);
423          OUT_RING(state, 0);
424          OUT_RING(state, 0);
425       }
426 
427       if (v) {
428          const struct ir3_ibo_mapping *mapping = &v->image_mapping;
429          struct fd_shaderbuf_stateobj *buf = &ctx->shaderbuf[type];
430          struct fd_shaderimg_stateobj *img = &ctx->shaderimg[type];
431 
432          for (unsigned i = 0; i < mapping->num_tex; i++) {
433             unsigned idx = mapping->tex_to_image[i];
434             if (idx & IBO_SSBO) {
435                fd6_emit_ssbo_tex(state, &buf->sb[idx & ~IBO_SSBO]);
436             } else {
437                fd6_emit_image_tex(state, &img->si[idx]);
438             }
439          }
440 
441          if (v->fb_read) {
442             fd6_emit_fb_tex(state, ctx);
443          }
444       }
445 
446       /* emit texture state: */
447       OUT_PKT7(ring, opcode, 3);
448       OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
449                         CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
450                         CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
451                         CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
452                         CP_LOAD_STATE6_0_NUM_UNIT(num_merged_textures));
453       OUT_RB(ring, state); /* SRC_ADDR_LO/HI */
454 
455       OUT_PKT4(ring, tex_const_reg, 2);
456       OUT_RB(ring, state); /* SRC_ADDR_LO/HI */
457 
458       fd_ringbuffer_del(state);
459    }
460 
461    OUT_PKT4(ring, tex_count_reg, 1);
462    OUT_RING(ring, num_merged_textures);
463 
464    return needs_border;
465 }
466 
467 /* Emits combined texture state, which also includes any Image/SSBO
468  * related texture state merged in (because we must have all texture
469  * state for a given stage in a single buffer).  In the fast-path, if
470  * we don't need to merge in any image/ssbo related texture state, we
471  * just use the cached texture stateobj.  Otherwise we generate a single-
472  * use stateobj.
473  *
474  * TODO Is there some sane way we can still use a cached texture stateobj
475  * with image/ssbo in use?
476  *
477  * returns whether border_color is required:
478  */
479 static bool
480 fd6_emit_combined_textures(struct fd_ringbuffer *ring, struct fd6_emit *emit,
481                            enum pipe_shader_type type,
482                            const struct ir3_shader_variant *v) assert_dt
483 {
484    struct fd_context *ctx = emit->ctx;
485    bool needs_border = false;
486 
487    static const struct {
488       enum fd6_state_id state_id;
489       unsigned enable_mask;
490    } s[PIPE_SHADER_TYPES] = {
491       [PIPE_SHADER_VERTEX] = {FD6_GROUP_VS_TEX, ENABLE_ALL},
492       [PIPE_SHADER_TESS_CTRL] = {FD6_GROUP_HS_TEX, ENABLE_ALL},
493       [PIPE_SHADER_TESS_EVAL] = {FD6_GROUP_DS_TEX, ENABLE_ALL},
494       [PIPE_SHADER_GEOMETRY] = {FD6_GROUP_GS_TEX, ENABLE_ALL},
495       [PIPE_SHADER_FRAGMENT] = {FD6_GROUP_FS_TEX, ENABLE_DRAW},
496    };
497 
498    debug_assert(s[type].state_id);
499 
500    if (!v->image_mapping.num_tex && !v->fb_read) {
501       /* in the fast-path, when we don't have to mix in any image/SSBO
502        * related texture state, we can just lookup the stateobj and
503        * re-emit that:
504        *
505        * Also, framebuffer-read is a slow-path because an extra
506        * texture needs to be inserted.
507        *
508        * TODO we can probably simplify things if we also treated
509        * border_color as a slow-path.. this way the tex state key
510        * wouldn't depend on bcolor_offset.. but fb_read might rather
511        * be *somehow* a fast-path if we eventually used it for PLS.
512        * I suppose there would be no harm in just *always* inserting
513        * an fb_read texture?
514        */
515       if ((ctx->dirty_shader[type] & FD_DIRTY_SHADER_TEX) &&
516           ctx->tex[type].num_textures > 0) {
517          struct fd6_texture_state *tex =
518             fd6_texture_state(ctx, type, &ctx->tex[type]);
519 
520          needs_border |= tex->needs_border;
521 
522          fd6_emit_add_group(emit, tex->stateobj, s[type].state_id,
523                             s[type].enable_mask);
524 
525          fd6_texture_state_reference(&tex, NULL);
526       }
527    } else {
528       /* In the slow-path, create a one-shot texture state object
529        * if either TEX|PROG|SSBO|IMAGE state is dirty:
530        */
531       if ((ctx->dirty_shader[type] &
532            (FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_IMAGE |
533             FD_DIRTY_SHADER_SSBO)) ||
534           v->fb_read) {
535          struct fd_texture_stateobj *tex = &ctx->tex[type];
536          struct fd_ringbuffer *stateobj = fd_submit_new_ringbuffer(
537             ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING);
538          unsigned bcolor_offset = fd6_border_color_offset(ctx, type, tex);
539 
540          needs_border |=
541             fd6_emit_textures(ctx, stateobj, type, tex, bcolor_offset, v);
542 
543          fd6_emit_take_group(emit, stateobj, s[type].state_id,
544                              s[type].enable_mask);
545       }
546    }
547 
548    return needs_border;
549 }
550 
551 static struct fd_ringbuffer *
552 build_vbo_state(struct fd6_emit *emit) assert_dt
553 {
554    const struct fd_vertex_state *vtx = emit->vtx;
555 
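   /* One dword of packet header plus 4 dwords of VFD_FETCH state per
    * vertex buffer:
    */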
556    struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
557       emit->ctx->batch->submit, 4 * (1 + vtx->vertexbuf.count * 4),
558       FD_RINGBUFFER_STREAMING);
559 
560    OUT_PKT4(ring, REG_A6XX_VFD_FETCH(0), 4 * vtx->vertexbuf.count);
561    for (int32_t j = 0; j < vtx->vertexbuf.count; j++) {
562       const struct pipe_vertex_buffer *vb = &vtx->vertexbuf.vb[j];
563       struct fd_resource *rsc = fd_resource(vb->buffer.resource);
564       if (rsc == NULL) {
565          OUT_RING(ring, 0);
566          OUT_RING(ring, 0);
567          OUT_RING(ring, 0);
568          OUT_RING(ring, 0);
569       } else {
570          uint32_t off = vb->buffer_offset;
571          uint32_t size = fd_bo_size(rsc->bo) - off;
572 
573          OUT_RELOC(ring, rsc->bo, off, 0, 0);
574          OUT_RING(ring, size);       /* VFD_FETCH[j].SIZE */
575          OUT_RING(ring, vb->stride); /* VFD_FETCH[j].STRIDE */
576       }
577    }
578 
579    return ring;
580 }
581 
582 static enum a6xx_ztest_mode
583 compute_ztest_mode(struct fd6_emit *emit, bool lrz_valid) assert_dt
584 {
585    struct fd_context *ctx = emit->ctx;
586    struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
587    struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);
588    const struct ir3_shader_variant *fs = emit->fs;
589 
590    if (fs->shader->nir->info.fs.early_fragment_tests)
591       return A6XX_EARLY_Z;
592 
593    if (fs->no_earlyz || fs->writes_pos || !zsa->base.depth_enabled ||
594        fs->writes_stencilref) {
595       return A6XX_LATE_Z;
596    } else if ((fs->has_kill || zsa->alpha_test) &&
597               (zsa->writes_zs || !pfb->zsbuf)) {
598       /* Slightly odd, but seems like the hw wants us to select
599        * LATE_Z mode if there is no depth buffer + discard.  Either
600        * that, or when occlusion query is enabled.  See:
601        *
602        * dEQP-GLES31.functional.fbo.no_attachments.*
603        */
604       return lrz_valid ? A6XX_EARLY_LRZ_LATE_Z : A6XX_LATE_Z;
605    } else {
606       return A6XX_EARLY_Z;
607    }
608 }
609 
610 /**
611  * Calculate normalized LRZ state based on zsa/prog/blend state, updating
612  * the zsbuf's lrz state as necessary to detect the cases where we need
613  * to invalidate lrz.
614  */
615 static struct fd6_lrz_state
616 compute_lrz_state(struct fd6_emit *emit, bool binning_pass) assert_dt
617 {
618    struct fd_context *ctx = emit->ctx;
619    struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
620    const struct ir3_shader_variant *fs = emit->fs;
621    struct fd6_lrz_state lrz;
622 
623    if (!pfb->zsbuf) {
624       memset(&lrz, 0, sizeof(lrz));
625       if (!binning_pass) {
626          lrz.z_mode = compute_ztest_mode(emit, false);
627       }
628       return lrz;
629    }
630 
631    struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);
632    struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);
633    struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
634 
635    lrz = zsa->lrz;
636 
637    /* normalize lrz state: */
638    if (blend->reads_dest || fs->writes_pos || fs->no_earlyz || fs->has_kill) {
639       lrz.write = false;
640       if (binning_pass)
641          lrz.enable = false;
642    }
643 
644    /* if we change depthfunc direction, bail out on using LRZ.  The
645     * LRZ buffer encodes a min/max depth value per block, but if
646     * we switch from GT/GE <-> LT/LE, those values cannot be
647     * interpreted properly.
648     */
649    if (zsa->base.depth_enabled && (rsc->lrz_direction != FD_LRZ_UNKNOWN) &&
650        (rsc->lrz_direction != lrz.direction)) {
651       rsc->lrz_valid = false;
652    }
653 
654    if (zsa->invalidate_lrz || !rsc->lrz_valid) {
655       rsc->lrz_valid = false;
656       memset(&lrz, 0, sizeof(lrz));
657    }
658 
659    if (fs->no_earlyz || fs->writes_pos) {
660       lrz.enable = false;
661       lrz.write = false;
662       lrz.test = false;
663    }
664 
665    if (!binning_pass) {
666       lrz.z_mode = compute_ztest_mode(emit, rsc->lrz_valid);
667    }
668 
669    /* Once we start writing to the real depth buffer, we lock in the
670     * direction for LRZ.. if we have to skip a LRZ write for any
671     * reason, it is still safe to have LRZ until there is a direction
672     * reversal.  Prior to the reversal, since we disabled LRZ writes
673     * in the "unsafe" cases, this just means that the LRZ test may
674     * not early-discard some things that end up not passing a later
675     * test (ie. be overly conservative).  But once you have a reversal
676     * of direction, it is possible to increase/decrease the z value
677     * to the point where the overly-conservative test is incorrect.
678     */
679    if (zsa->base.depth_writemask) {
680       rsc->lrz_direction = lrz.direction;
681    }
682 
683    return lrz;
684 }
685 
686 static struct fd_ringbuffer *
687 build_lrz(struct fd6_emit *emit, bool binning_pass) assert_dt
688 {
689    struct fd_context *ctx = emit->ctx;
690    struct fd6_context *fd6_ctx = fd6_context(ctx);
691    struct fd6_lrz_state lrz = compute_lrz_state(emit, binning_pass);
692 
693    /* If the LRZ state has not changed, we can skip the emit: */
694    if (!ctx->last.dirty &&
695        !memcmp(&fd6_ctx->last.lrz[binning_pass], &lrz, sizeof(lrz)))
696       return NULL;
697 
698    fd6_ctx->last.lrz[binning_pass] = lrz;
699 
700    struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
701       ctx->batch->submit, 8 * 4, FD_RINGBUFFER_STREAMING);
702 
703    OUT_REG(ring,
704            A6XX_GRAS_LRZ_CNTL(.enable = lrz.enable, .lrz_write = lrz.write,
705                               .greater = lrz.direction == FD_LRZ_GREATER,
706                               .z_test_enable = lrz.test, ));
707    OUT_REG(ring, A6XX_RB_LRZ_CNTL(.enable = lrz.enable, ));
708 
709    OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));
710 
711    OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, ));
712 
713    return ring;
714 }
715 
716 static struct fd_ringbuffer *
717 build_scissor(struct fd6_emit *emit) assert_dt
718 {
719    struct fd_context *ctx = emit->ctx;
720    struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx);
721 
722    struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
723       emit->ctx->batch->submit, 3 * 4, FD_RINGBUFFER_STREAMING);
724 
725    OUT_REG(
726       ring,
727       A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = scissor->minx, .y = scissor->miny),
728       A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = MAX2(scissor->maxx, 1) - 1,
729                                      .y = MAX2(scissor->maxy, 1) - 1));
730 
731    ctx->batch->max_scissor.minx =
732       MIN2(ctx->batch->max_scissor.minx, scissor->minx);
733    ctx->batch->max_scissor.miny =
734       MIN2(ctx->batch->max_scissor.miny, scissor->miny);
735    ctx->batch->max_scissor.maxx =
736       MAX2(ctx->batch->max_scissor.maxx, scissor->maxx);
737    ctx->batch->max_scissor.maxy =
738       MAX2(ctx->batch->max_scissor.maxy, scissor->maxy);
739 
740    return ring;
741 }
742 
743 /* Combination of FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER_DISCARD |
744  * FD_DIRTY_PROG | FD_DIRTY_DUAL_BLEND
745  */
746 static struct fd_ringbuffer *
747 build_prog_fb_rast(struct fd6_emit *emit) assert_dt
748 {
749    struct fd_context *ctx = emit->ctx;
750    struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
751    const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
752    const struct ir3_shader_variant *fs = emit->fs;
753 
754    struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
755       ctx->batch->submit, 9 * 4, FD_RINGBUFFER_STREAMING);
756 
757    unsigned nr = pfb->nr_cbufs;
758 
759    if (ctx->rasterizer->rasterizer_discard)
760       nr = 0;
761 
762    struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);
763 
764    if (blend->use_dual_src_blend)
765       nr++;
766 
767    OUT_PKT4(ring, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
768    OUT_RING(ring, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
769                      COND(fs->writes_smask && pfb->samples > 1,
770                           A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
771                      COND(fs->writes_stencilref,
772                           A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
773                      COND(blend->use_dual_src_blend,
774                           A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
775    OUT_RING(ring, A6XX_RB_FS_OUTPUT_CNTL1_MRT(nr));
776 
777    OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL1, 1);
778    OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr));
779 
780    unsigned mrt_components = 0;
781    for (unsigned i = 0; i < pfb->nr_cbufs; i++) {
782       if (!pfb->cbufs[i])
783          continue;
784       mrt_components |= 0xf << (i * 4);
785    }
786 
787    /* dual source blending has an extra fs output in the 2nd slot */
788    if (blend->use_dual_src_blend)
789       mrt_components |= 0xf << 4;
790 
791    mrt_components &= prog->mrt_components;
792 
793    OUT_REG(ring, A6XX_SP_FS_RENDER_COMPONENTS(.dword = mrt_components));
794    OUT_REG(ring, A6XX_RB_RENDER_COMPONENTS(.dword = mrt_components));
795 
796    return ring;
797 }
798 
799 static struct fd_ringbuffer *
800 build_blend_color(struct fd6_emit *emit) assert_dt
801 {
802    struct fd_context *ctx = emit->ctx;
803    struct pipe_blend_color *bcolor = &ctx->blend_color;
804    struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
805       ctx->batch->submit, 5 * 4, FD_RINGBUFFER_STREAMING);
806 
807    OUT_REG(ring, A6XX_RB_BLEND_RED_F32(bcolor->color[0]),
808            A6XX_RB_BLEND_GREEN_F32(bcolor->color[1]),
809            A6XX_RB_BLEND_BLUE_F32(bcolor->color[2]),
810            A6XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));
811 
812    return ring;
813 }
814 
815 static struct fd_ringbuffer *
816 build_ibo(struct fd6_emit *emit) assert_dt
817 {
818    struct fd_context *ctx = emit->ctx;
819 
820    if (emit->hs) {
821       debug_assert(ir3_shader_nibo(emit->hs) == 0);
822       debug_assert(ir3_shader_nibo(emit->ds) == 0);
823    }
824    if (emit->gs) {
825       debug_assert(ir3_shader_nibo(emit->gs) == 0);
826    }
827 
828    struct fd_ringbuffer *ibo_state =
829       fd6_build_ibo_state(ctx, emit->fs, PIPE_SHADER_FRAGMENT);
830    struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(
831       ctx->batch->submit, 0x100, FD_RINGBUFFER_STREAMING);
832 
833    OUT_PKT7(ring, CP_LOAD_STATE6, 3);
834    OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
835                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
836                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
837                      CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) |
838                      CP_LOAD_STATE6_0_NUM_UNIT(ir3_shader_nibo(emit->fs)));
839    OUT_RB(ring, ibo_state);
840 
841    OUT_PKT4(ring, REG_A6XX_SP_IBO, 2);
842    OUT_RB(ring, ibo_state);
843 
844    /* TODO if we used CP_SET_DRAW_STATE for compute shaders, we could
845     * de-duplicate this from program->config_stateobj
846     */
847    OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1);
848    OUT_RING(ring, ir3_shader_nibo(emit->fs));
849 
850    fd_ringbuffer_del(ibo_state);
851 
852    return ring;
853 }
854 
855 static void
856 fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
857 {
858    struct fd_context *ctx = emit->ctx;
859    const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
860    struct ir3_stream_output_info *info = prog->stream_output;
861    struct fd_streamout_stateobj *so = &ctx->streamout;
862 
863    emit->streamout_mask = 0;
864 
865    if (!info)
866       return;
867 
868    for (unsigned i = 0; i < so->num_targets; i++) {
869       struct fd_stream_output_target *target =
870          fd_stream_output_target(so->targets[i]);
871 
872       if (!target)
873          continue;
874 
875       target->stride = info->stride[i];
876 
877       OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_BASE(i), 3);
878       /* VPC_SO[i].BUFFER_BASE_LO: */
879       OUT_RELOC(ring, fd_resource(target->base.buffer)->bo, 0, 0, 0);
880       OUT_RING(ring, target->base.buffer_size + target->base.buffer_offset); /* VPC_SO[i].BUFFER_SIZE */
881 
882       struct fd_bo *offset_bo = fd_resource(target->offset_buf)->bo;
883 
884       if (so->reset & (1 << i)) {
885          assert(so->offsets[i] == 0);
886 
887          OUT_PKT7(ring, CP_MEM_WRITE, 3);
888          OUT_RELOC(ring, offset_bo, 0, 0, 0);
889          OUT_RING(ring, target->base.buffer_offset);
890 
891          OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_OFFSET(i), 1);
892          OUT_RING(ring, target->base.buffer_offset);
893       } else {
894          OUT_PKT7(ring, CP_MEM_TO_REG, 3);
895          OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) |
896                            CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 |
897                            CP_MEM_TO_REG_0_CNT(0));
898          OUT_RELOC(ring, offset_bo, 0, 0, 0);
899       }
900 
901       // After a draw, the HW writes the new offset to offset_bo
902       OUT_PKT4(ring, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
903       OUT_RELOC(ring, offset_bo, 0, 0, 0);
904 
905       so->reset &= ~(1 << i);
906 
907       emit->streamout_mask |= (1 << i);
908    }
909 
910    if (emit->streamout_mask) {
911       fd6_emit_add_group(emit, prog->streamout_stateobj, FD6_GROUP_SO,
912                          ENABLE_ALL);
913    } else if (ctx->last.streamout_mask != 0) {
914       /* If we transition from a draw with streamout to one without, turn
915        * off streamout.
916        */
917       fd6_emit_add_group(emit, fd6_context(ctx)->streamout_disable_stateobj,
918                          FD6_GROUP_SO, ENABLE_ALL);
919    }
920 
921    /* Make sure that any use of our TFB outputs (indirect draw source or shader
922     * UBO reads) comes after the TFB output is written.  From the GL 4.6 core
923     * spec:
924     *
925     *     "Buffers should not be bound or in use for both transform feedback and
926     *      other purposes in the GL.  Specifically, if a buffer object is
927     *      simultaneously bound to a transform feedback buffer binding point
928     *      and elsewhere in the GL, any writes to or reads from the buffer
929     *      generate undefined values."
930     *
931     * So we idle whenever SO buffers change.  Note that this function is called
932     * on every draw with TFB enabled, so check the dirty flag for the buffers
933     * themselves.
934     */
935    if (ctx->dirty & FD_DIRTY_STREAMOUT)
936       fd_wfi(ctx->batch, ring);
937 
938    ctx->last.streamout_mask = emit->streamout_mask;
939 }
940 
941 /**
942  * Stuff that changes less frequently and isn't (yet) moved into stategroups
943  */
944 static void
945 fd6_emit_non_ring(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt
946 {
947    struct fd_context *ctx = emit->ctx;
948    const enum fd_dirty_3d_state dirty = emit->dirty;
949 
950    if (dirty & FD_DIRTY_STENCIL_REF) {
951       struct pipe_stencil_ref *sr = &ctx->stencil_ref;
952 
953       OUT_PKT4(ring, REG_A6XX_RB_STENCILREF, 1);
954       OUT_RING(ring, A6XX_RB_STENCILREF_REF(sr->ref_value[0]) |
955                         A6XX_RB_STENCILREF_BFREF(sr->ref_value[1]));
956    }
957 
958    if (dirty & FD_DIRTY_VIEWPORT) {
959       struct pipe_scissor_state *scissor = &ctx->viewport_scissor;
960 
961       OUT_REG(ring, A6XX_GRAS_CL_VPORT_XOFFSET(0, ctx->viewport.translate[0]),
962               A6XX_GRAS_CL_VPORT_XSCALE(0, ctx->viewport.scale[0]),
963               A6XX_GRAS_CL_VPORT_YOFFSET(0, ctx->viewport.translate[1]),
964               A6XX_GRAS_CL_VPORT_YSCALE(0, ctx->viewport.scale[1]),
965               A6XX_GRAS_CL_VPORT_ZOFFSET(0, ctx->viewport.translate[2]),
966               A6XX_GRAS_CL_VPORT_ZSCALE(0, ctx->viewport.scale[2]));
967 
968       OUT_REG(
969          ring,
970          A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = scissor->minx,
971                                           .y = scissor->miny),
972          A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = MAX2(scissor->maxx, 1) - 1,
973                                           .y = MAX2(scissor->maxy, 1) - 1));
974 
975       unsigned guardband_x = fd_calc_guardband(ctx->viewport.translate[0],
976                                                ctx->viewport.scale[0], false);
977       unsigned guardband_y = fd_calc_guardband(ctx->viewport.translate[1],
978                                                ctx->viewport.scale[1], false);
979 
980       OUT_REG(ring, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ(.horz = guardband_x,
981                                                     .vert = guardband_y));
982    }
983 
984    /* The clamp ranges are only used when the rasterizer wants depth
985     * clamping.
986     */
987    if ((dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER)) &&
988        fd_depth_clamp_enabled(ctx)) {
989       float zmin, zmax;
990       util_viewport_zmin_zmax(&ctx->viewport, ctx->rasterizer->clip_halfz,
991                               &zmin, &zmax);
992 
993       OUT_REG(ring, A6XX_GRAS_CL_Z_CLAMP_MIN(0, zmin),
994               A6XX_GRAS_CL_Z_CLAMP_MAX(0, zmax));
995 
996       OUT_REG(ring, A6XX_RB_Z_CLAMP_MIN(zmin), A6XX_RB_Z_CLAMP_MAX(zmax));
997    }
998 }
999 
1000 void
1001 fd6_emit_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
1002 {
1003    struct fd_context *ctx = emit->ctx;
1004    struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
1005    const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
1006    const struct ir3_shader_variant *vs = emit->vs;
1007    const struct ir3_shader_variant *hs = emit->hs;
1008    const struct ir3_shader_variant *ds = emit->ds;
1009    const struct ir3_shader_variant *gs = emit->gs;
1010    const struct ir3_shader_variant *fs = emit->fs;
1011    bool needs_border = false;
1012 
1013    emit_marker6(ring, 5);
1014 
1015    /* NOTE: we track fb_read differently than _BLEND_ENABLED since we
1016     * might decide to do sysmem in some cases when blend is enabled:
1017     */
1018    if (fs->fb_read)
1019       ctx->batch->gmem_reason |= FD_GMEM_FB_READ;
1020 
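   /* Build (or look up) a state group for each dirty group bit; the
    * collected groups are emitted via CP_SET_DRAW_STATE at the end:
    */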
1021    u_foreach_bit (b, emit->dirty_groups) {
1022       enum fd6_state_id group = b;
1023       struct fd_ringbuffer *state = NULL;
1024       uint32_t enable_mask = ENABLE_ALL;
1025 
1026       switch (group) {
1027       case FD6_GROUP_VTXSTATE:
1028          state = fd6_vertex_stateobj(ctx->vtx.vtx)->stateobj;
1029          fd_ringbuffer_ref(state);
1030          break;
1031       case FD6_GROUP_VBO:
1032          state = build_vbo_state(emit);
1033          break;
1034       case FD6_GROUP_ZSA:
1035          state = fd6_zsa_state(
1036             ctx,
1037             util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0])),
1038             fd_depth_clamp_enabled(ctx));
1039          fd_ringbuffer_ref(state);
1040          break;
1041       case FD6_GROUP_LRZ:
1042          state = build_lrz(emit, false);
1043          if (!state)
1044             continue;
1045          enable_mask = ENABLE_DRAW;
1046          break;
1047       case FD6_GROUP_LRZ_BINNING:
1048          state = build_lrz(emit, true);
1049          if (!state)
1050             continue;
1051          enable_mask = CP_SET_DRAW_STATE__0_BINNING;
1052          break;
1053       case FD6_GROUP_SCISSOR:
1054          state = build_scissor(emit);
1055          break;
1056       case FD6_GROUP_PROG:
1057          fd6_emit_add_group(emit, prog->config_stateobj, FD6_GROUP_PROG_CONFIG,
1058                             ENABLE_ALL);
1059          fd6_emit_add_group(emit, prog->stateobj, FD6_GROUP_PROG, ENABLE_DRAW);
1060          fd6_emit_add_group(emit, prog->binning_stateobj,
1061                             FD6_GROUP_PROG_BINNING,
1062                             CP_SET_DRAW_STATE__0_BINNING);
1063 
1064          /* emit remaining streaming program state, ie. what depends on
1065           * other emit state, so cannot be pre-baked.
1066           */
1067          fd6_emit_take_group(emit, fd6_program_interp_state(emit),
1068                              FD6_GROUP_PROG_INTERP, ENABLE_DRAW);
1069          continue;
1070       case FD6_GROUP_RASTERIZER:
1071          state = fd6_rasterizer_state(ctx, emit->primitive_restart);
1072          fd_ringbuffer_ref(state);
1073          break;
1074       case FD6_GROUP_PROG_FB_RAST:
1075          state = build_prog_fb_rast(emit);
1076          break;
1077       case FD6_GROUP_BLEND:
1078          state = fd6_blend_variant(ctx->blend, pfb->samples, ctx->sample_mask)
1079                     ->stateobj;
1080          fd_ringbuffer_ref(state);
1081          break;
1082       case FD6_GROUP_BLEND_COLOR:
1083          state = build_blend_color(emit);
1084          break;
1085       case FD6_GROUP_IBO:
1086          state = build_ibo(emit);
1087          break;
1088       case FD6_GROUP_CONST:
1089          state = fd6_build_user_consts(emit);
1090          break;
1091       case FD6_GROUP_VS_DRIVER_PARAMS:
1092          state = fd6_build_vs_driver_params(emit);
1093          break;
1094       case FD6_GROUP_PRIMITIVE_PARAMS:
1095          state = fd6_build_tess_consts(emit);
1096          break;
1097       case FD6_GROUP_VS_TEX:
1098          needs_border |=
1099             fd6_emit_combined_textures(ring, emit, PIPE_SHADER_VERTEX, vs);
1100          continue;
1101       case FD6_GROUP_HS_TEX:
1102          if (hs) {
1103             needs_border |= fd6_emit_combined_textures(
1104                ring, emit, PIPE_SHADER_TESS_CTRL, hs);
1105          }
1106          continue;
1107       case FD6_GROUP_DS_TEX:
1108          if (ds) {
1109             needs_border |= fd6_emit_combined_textures(
1110                ring, emit, PIPE_SHADER_TESS_EVAL, ds);
1111          }
1112          continue;
1113       case FD6_GROUP_GS_TEX:
1114          if (gs) {
1115             needs_border |=
1116                fd6_emit_combined_textures(ring, emit, PIPE_SHADER_GEOMETRY, gs);
1117          }
1118          continue;
1119       case FD6_GROUP_FS_TEX:
1120          needs_border |=
1121             fd6_emit_combined_textures(ring, emit, PIPE_SHADER_FRAGMENT, fs);
1122          continue;
1123       case FD6_GROUP_SO:
1124          fd6_emit_streamout(ring, emit);
1125          continue;
1126       case FD6_GROUP_NON_GROUP:
1127          fd6_emit_non_ring(ring, emit);
1128          continue;
1129       default:
1130          unreachable("bad state group");
1131       }
1132 
1133       fd6_emit_take_group(emit, state, group, enable_mask);
1134    }
1135 
1136    if (needs_border)
1137       emit_border_color(ctx, ring);
1138 
1139    if (emit->num_groups > 0) {
1140       OUT_PKT7(ring, CP_SET_DRAW_STATE, 3 * emit->num_groups);
1141       for (unsigned i = 0; i < emit->num_groups; i++) {
1142          struct fd6_state_group *g = &emit->groups[i];
1143          unsigned n = g->stateobj ? fd_ringbuffer_size(g->stateobj) / 4 : 0;
1144 
1145          debug_assert((g->enable_mask & ~ENABLE_ALL) == 0);
1146 
1147          if (n == 0) {
1148             OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
1149                               CP_SET_DRAW_STATE__0_DISABLE | g->enable_mask |
1150                               CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id));
1151             OUT_RING(ring, 0x00000000);
1152             OUT_RING(ring, 0x00000000);
1153          } else {
1154             OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(n) | g->enable_mask |
1155                               CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id));
1156             OUT_RB(ring, g->stateobj);
1157          }
1158 
1159          if (g->stateobj)
1160             fd_ringbuffer_del(g->stateobj);
1161       }
1162       emit->num_groups = 0;
1163    }
1164 }
1165 
1166 void
1167 fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
1168                   struct ir3_shader_variant *cp)
1169 {
1170    enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_COMPUTE];
1171 
1172    if (dirty & (FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG |
1173                 FD_DIRTY_SHADER_IMAGE | FD_DIRTY_SHADER_SSBO)) {
1174       struct fd_texture_stateobj *tex = &ctx->tex[PIPE_SHADER_COMPUTE];
1175       unsigned bcolor_offset =
1176          fd6_border_color_offset(ctx, PIPE_SHADER_COMPUTE, tex);
1177 
1178       bool needs_border = fd6_emit_textures(ctx, ring, PIPE_SHADER_COMPUTE, tex,
1179                                             bcolor_offset, cp);
1180 
1181       if (needs_border)
1182          emit_border_color(ctx, ring);
1183 
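      /* Clear the graphics stages' texture counts, presumably so stale
       * graphics texture state doesn't affect the compute dispatch:
       */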
1184       OUT_PKT4(ring, REG_A6XX_SP_VS_TEX_COUNT, 1);
1185       OUT_RING(ring, 0);
1186 
1187       OUT_PKT4(ring, REG_A6XX_SP_HS_TEX_COUNT, 1);
1188       OUT_RING(ring, 0);
1189 
1190       OUT_PKT4(ring, REG_A6XX_SP_DS_TEX_COUNT, 1);
1191       OUT_RING(ring, 0);
1192 
1193       OUT_PKT4(ring, REG_A6XX_SP_GS_TEX_COUNT, 1);
1194       OUT_RING(ring, 0);
1195 
1196       OUT_PKT4(ring, REG_A6XX_SP_FS_TEX_COUNT, 1);
1197       OUT_RING(ring, 0);
1198    }
1199 
1200    if (dirty & (FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)) {
1201       struct fd_ringbuffer *state =
1202          fd6_build_ibo_state(ctx, cp, PIPE_SHADER_COMPUTE);
1203 
1204       OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);
1205       OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
1206                         CP_LOAD_STATE6_0_STATE_TYPE(ST6_IBO) |
1207                         CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1208                         CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
1209                         CP_LOAD_STATE6_0_NUM_UNIT(ir3_shader_nibo(cp)));
1210       OUT_RB(ring, state);
1211 
1212       OUT_PKT4(ring, REG_A6XX_SP_CS_IBO, 2);
1213       OUT_RB(ring, state);
1214 
1215       OUT_PKT4(ring, REG_A6XX_SP_CS_IBO_COUNT, 1);
1216       OUT_RING(ring, ir3_shader_nibo(cp));
1217 
1218       fd_ringbuffer_del(state);
1219    }
1220 }
1221 
1222 /* emit setup at the start of a new cmdstream buffer (don't rely on previous
1223  * state, there could have been a context switch between ioctls):
1224  */
1225 void
1226 fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
1227 {
1228    struct fd_screen *screen = batch->ctx->screen;
1229 
1230    if (!batch->nondraw) {
1231       trace_start_state_restore(&batch->trace, ring);
1232    }
1233 
1234    fd6_cache_inv(batch, ring);
1235 
1236    OUT_REG(ring,
1237            A6XX_HLSQ_INVALIDATE_CMD(.vs_state = true, .hs_state = true,
1238                                     .ds_state = true, .gs_state = true,
1239                                     .fs_state = true, .cs_state = true,
1240                                     .gfx_ibo = true, .cs_ibo = true,
1241                                     .gfx_shared_const = true,
1242                                     .cs_shared_const = true,
1243                                     .gfx_bindless = 0x1f, .cs_bindless = 0x1f));
1244 
1245    OUT_WFI5(ring);
1246 
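   /* One-time default register values; many of these are reverse-engineered
    * "UNKNOWN" magic values:
    */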
1247    WRITE(REG_A6XX_RB_UNKNOWN_8E04, 0x0);
1248    WRITE(REG_A6XX_SP_FLOAT_CNTL, A6XX_SP_FLOAT_CNTL_F16_NO_INF);
1249    WRITE(REG_A6XX_SP_UNKNOWN_AE00, 0);
1250    WRITE(REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
1251    WRITE(REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
1252    WRITE(REG_A6XX_TPL1_DBG_ECO_CNTL, screen->info->a6xx.magic.TPL1_DBG_ECO_CNTL);
1253    WRITE(REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
1254    WRITE(REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
1255 
1256    WRITE(REG_A6XX_VPC_UNKNOWN_9600, 0);
1257    WRITE(REG_A6XX_GRAS_DBG_ECO_CNTL, 0x880);
1258    WRITE(REG_A6XX_HLSQ_UNKNOWN_BE04, 0x80000);
1259    WRITE(REG_A6XX_SP_CHICKEN_BITS, 0x1430);
1260    WRITE(REG_A6XX_SP_IBO_COUNT, 0);
1261    WRITE(REG_A6XX_SP_UNKNOWN_B182, 0);
1262    WRITE(REG_A6XX_HLSQ_SHARED_CONSTS, 0);
1263    WRITE(REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
1264    WRITE(REG_A6XX_UCHE_CLIENT_PF, 4);
1265    WRITE(REG_A6XX_RB_UNKNOWN_8E01, 0x1);
1266    WRITE(REG_A6XX_SP_MODE_CONTROL,
1267          A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);
1268    WRITE(REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
1269    WRITE(REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
1270    WRITE(REG_A6XX_PC_MODE_CNTL, 0x1f);
1271 
1272    WRITE(REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 0);
1273    WRITE(REG_A6XX_GRAS_SAMPLE_CNTL, 0);
1274    WRITE(REG_A6XX_GRAS_UNKNOWN_8110, 0x2);
1275 
1276    WRITE(REG_A6XX_RB_UNKNOWN_8818, 0);
1277    WRITE(REG_A6XX_RB_UNKNOWN_8819, 0);
1278    WRITE(REG_A6XX_RB_UNKNOWN_881A, 0);
1279    WRITE(REG_A6XX_RB_UNKNOWN_881B, 0);
1280    WRITE(REG_A6XX_RB_UNKNOWN_881C, 0);
1281    WRITE(REG_A6XX_RB_UNKNOWN_881D, 0);
1282    WRITE(REG_A6XX_RB_UNKNOWN_881E, 0);
1283    WRITE(REG_A6XX_RB_UNKNOWN_88F0, 0);
1284 
1285    WRITE(REG_A6XX_VPC_POINT_COORD_INVERT, A6XX_VPC_POINT_COORD_INVERT(0).value);
1286    WRITE(REG_A6XX_VPC_UNKNOWN_9300, 0);
1287 
1288    WRITE(REG_A6XX_VPC_SO_DISABLE, A6XX_VPC_SO_DISABLE(true).value);
1289 
1290    WRITE(REG_A6XX_PC_RASTER_CNTL, 0);
1291 
1292    WRITE(REG_A6XX_PC_MULTIVIEW_CNTL, 0);
1293 
1294    WRITE(REG_A6XX_SP_UNKNOWN_B183, 0);
1295 
1296    WRITE(REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0);
1297    WRITE(REG_A6XX_GRAS_VS_LAYER_CNTL, 0);
1298    WRITE(REG_A6XX_GRAS_SC_CNTL, A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
1299    WRITE(REG_A6XX_GRAS_UNKNOWN_80AF, 0);
1300    WRITE(REG_A6XX_VPC_UNKNOWN_9210, 0);
1301    WRITE(REG_A6XX_VPC_UNKNOWN_9211, 0);
1302    WRITE(REG_A6XX_VPC_UNKNOWN_9602, 0);
1303    WRITE(REG_A6XX_PC_UNKNOWN_9E72, 0);
1304    WRITE(REG_A6XX_SP_TP_SAMPLE_CONFIG, 0);
1305    /* NOTE blob seems to (mostly?) use 0xb2 for SP_TP_MODE_CNTL
1306     * but this seems to kill texture gather offsets.
1307     */
1308    WRITE(REG_A6XX_SP_TP_MODE_CNTL, 0xa0 |
1309          A6XX_SP_TP_MODE_CNTL_ISAMMODE(ISAMMODE_GL));
1310    WRITE(REG_A6XX_RB_SAMPLE_CONFIG, 0);
1311    WRITE(REG_A6XX_GRAS_SAMPLE_CONFIG, 0);
1312    WRITE(REG_A6XX_RB_Z_BOUNDS_MIN, 0);
1313    WRITE(REG_A6XX_RB_Z_BOUNDS_MAX, 0);
1314    WRITE(REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);
1315 
1316    emit_marker6(ring, 7);
1317 
1318    OUT_PKT4(ring, REG_A6XX_VFD_MODE_CNTL, 1);
1319    OUT_RING(ring, 0x00000000); /* VFD_MODE_CNTL */
1320 
1321    WRITE(REG_A6XX_VFD_MULTIVIEW_CNTL, 0);
1322 
1323    OUT_PKT4(ring, REG_A6XX_PC_MODE_CNTL, 1);
1324    OUT_RING(ring, 0x0000001f); /* PC_MODE_CNTL */
1325 
1326    /* Clear any potential pending state groups to be safe: */
1327    OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
1328    OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
1329                      CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
1330                      CP_SET_DRAW_STATE__0_GROUP_ID(0));
1331    OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
1332    OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));
1333 
1334    OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_CNTL, 1);
1335    OUT_RING(ring, 0x00000000); /* VPC_SO_STREAM_CNTL */
1336 
1337    OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1);
1338    OUT_RING(ring, 0x00000000);
1339 
1340    OUT_PKT4(ring, REG_A6XX_RB_LRZ_CNTL, 1);
1341    OUT_RING(ring, 0x00000000);
1342 
1343    if (!batch->nondraw) {
1344       trace_end_state_restore(&batch->trace, ring);
1345    }
1346 }
1347 
1348 static void
1349 fd6_mem_to_mem(struct fd_ringbuffer *ring, struct pipe_resource *dst,
1350                unsigned dst_off, struct pipe_resource *src, unsigned src_off,
1351                unsigned sizedwords)
1352 {
1353    struct fd_bo *src_bo = fd_resource(src)->bo;
1354    struct fd_bo *dst_bo = fd_resource(dst)->bo;
1355    unsigned i;
1356 
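   /* Each CP_MEM_TO_MEM packet below copies a single dword from src to dst: */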
1357    for (i = 0; i < sizedwords; i++) {
1358       OUT_PKT7(ring, CP_MEM_TO_MEM, 5);
1359       OUT_RING(ring, 0x00000000);
1360       OUT_RELOC(ring, dst_bo, dst_off, 0, 0);
1361       OUT_RELOC(ring, src_bo, src_off, 0, 0);
1362 
1363       dst_off += 4;
1364       src_off += 4;
1365    }
1366 }
1367 
1368 /* this is *almost* the same as fd6_cache_flush().. which I guess
1369  * could be re-worked to be something a bit more generic w/ param
1370  * indicating what needs to be flushed..  although that would mean
1371  * figuring out which events trigger what state to flush..
1372  */
1373 static void
1374 fd6_framebuffer_barrier(struct fd_context *ctx) assert_dt
1375 {
1376    struct fd6_context *fd6_ctx = fd6_context(ctx);
1377    struct fd_batch *batch = fd_context_batch_locked(ctx);
1378    struct fd_ringbuffer *ring = batch->draw;
1379    unsigned seqno;
1380 
1381    fd_batch_needs_flush(batch);
1382 
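   /* Wait for prior rendering to complete (RB_DONE_TS), then flush the CCU
    * color/depth caches and wait for the cache flush to land:
    */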
1383    seqno = fd6_event_write(batch, ring, RB_DONE_TS, true);
1384 
1385    OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
1386    OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
1387                      CP_WAIT_REG_MEM_0_POLL_MEMORY);
1388    OUT_RELOC(ring, control_ptr(fd6_ctx, seqno));
1389    OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno));
1390    OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0));
1391    OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
1392 
1393    fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true);
1394    fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true);
1395 
1396    seqno = fd6_event_write(batch, ring, CACHE_FLUSH_TS, true);
1397    fd_wfi(batch, ring);
1398 
1399    fd6_event_write(batch, ring, 0x31, false);
1400 
1401    OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4);
1402    OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0));
1403    OUT_RELOC(ring, control_ptr(fd6_ctx, seqno));
1404    OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno));
1405 
1406    fd_batch_unlock_submit(batch);
1407    fd_batch_reference(&batch, NULL);
1408 }
1409 
1410 void
1411 fd6_emit_init_screen(struct pipe_screen *pscreen)
1412 {
1413    struct fd_screen *screen = fd_screen(pscreen);
1414    screen->emit_ib = fd6_emit_ib;
1415    screen->mem_to_mem = fd6_mem_to_mem;
1416 }
1417 
1418 void
1419 fd6_emit_init(struct pipe_context *pctx) disable_thread_safety_analysis
1420 {
1421    struct fd_context *ctx = fd_context(pctx);
1422    ctx->framebuffer_barrier = fd6_framebuffer_barrier;
1423 }
1424