1 /*
2  * Copyright © 2021 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "gen_macros.h"
25 
26 #include "nir/nir_builder.h"
27 #include "pan_encoder.h"
28 #include "pan_shader.h"
29 
30 #include "panvk_private.h"
31 
32 static mali_ptr
panvk_meta_copy_img_emit_texture(struct panfrost_device * pdev,struct pan_pool * desc_pool,const struct pan_image_view * view)33 panvk_meta_copy_img_emit_texture(struct panfrost_device *pdev,
34                                  struct pan_pool *desc_pool,
35                                  const struct pan_image_view *view)
36 {
37 #if PAN_ARCH >= 6
38    struct panfrost_ptr texture =
39       pan_pool_alloc_desc(desc_pool, TEXTURE);
40    size_t payload_size =
41       GENX(panfrost_estimate_texture_payload_size)(view);
42    struct panfrost_ptr surfaces =
43       pan_pool_alloc_aligned(desc_pool, payload_size,
44                              pan_alignment(SURFACE_WITH_STRIDE));
45 
46    GENX(panfrost_new_texture)(pdev, view, texture.cpu, &surfaces);
47 
48    return texture.gpu;
49 #else
50    size_t sz = pan_size(TEXTURE) +
51                GENX(panfrost_estimate_texture_payload_size)(view);
52    struct panfrost_ptr texture =
53       pan_pool_alloc_aligned(desc_pool, sz, pan_alignment(TEXTURE));
54    struct panfrost_ptr surfaces = {
55       .cpu = texture.cpu + pan_size(TEXTURE),
56       .gpu = texture.gpu + pan_size(TEXTURE),
57    };
58 
59    GENX(panfrost_new_texture)(pdev, view, texture.cpu, &surfaces);
60 
61    return pan_pool_upload_aligned(desc_pool, &texture.gpu,
62                                   sizeof(mali_ptr),
63                                   sizeof(mali_ptr));
64 #endif
65 }
66 
67 static mali_ptr
panvk_meta_copy_img_emit_sampler(struct panfrost_device * pdev,struct pan_pool * desc_pool)68 panvk_meta_copy_img_emit_sampler(struct panfrost_device *pdev,
69                                  struct pan_pool *desc_pool)
70 {
71    struct panfrost_ptr sampler =
72       pan_pool_alloc_desc(desc_pool, SAMPLER);
73 
74    pan_pack(sampler.cpu, SAMPLER, cfg) {
75 #if PAN_ARCH >= 6
76       cfg.seamless_cube_map = false;
77 #endif
78       cfg.normalized_coordinates = false;
79       cfg.minify_nearest = true;
80       cfg.magnify_nearest = true;
81    }
82 
83    return sampler.gpu;
84 }
85 
86 static void
panvk_meta_copy_emit_varying(struct pan_pool * pool,mali_ptr coordinates,mali_ptr * varying_bufs,mali_ptr * varyings)87 panvk_meta_copy_emit_varying(struct pan_pool *pool,
88                              mali_ptr coordinates,
89                              mali_ptr *varying_bufs,
90                              mali_ptr *varyings)
91 {
92    /* Bifrost needs an empty desc to mark end of prefetching */
93    bool padding_buffer = PAN_ARCH >= 6;
94 
95    struct panfrost_ptr varying =
96       pan_pool_alloc_desc(pool, ATTRIBUTE);
97    struct panfrost_ptr varying_buffer =
98       pan_pool_alloc_desc_array(pool, (padding_buffer ? 2 : 1),
99                                      ATTRIBUTE_BUFFER);
100 
101    pan_pack(varying_buffer.cpu, ATTRIBUTE_BUFFER, cfg) {
102       cfg.pointer = coordinates;
103       cfg.stride = 4 * sizeof(uint32_t);
104       cfg.size = cfg.stride * 4;
105    }
106 
107    if (padding_buffer) {
108       pan_pack(varying_buffer.cpu + pan_size(ATTRIBUTE_BUFFER),
109                ATTRIBUTE_BUFFER, cfg);
110    }
111 
112    pan_pack(varying.cpu, ATTRIBUTE, cfg) {
113       cfg.buffer_index = 0;
114       cfg.offset_enable = PAN_ARCH <= 5;
115       cfg.format = pool->dev->formats[PIPE_FORMAT_R32G32B32_FLOAT].hw;
116    }
117 
118    *varyings = varying.gpu;
119    *varying_bufs = varying_buffer.gpu;
120 }
121 
122 static void
panvk_meta_copy_emit_dcd(struct pan_pool * pool,mali_ptr src_coords,mali_ptr dst_coords,mali_ptr texture,mali_ptr sampler,mali_ptr vpd,mali_ptr tsd,mali_ptr rsd,mali_ptr ubos,mali_ptr push_constants,void * out)123 panvk_meta_copy_emit_dcd(struct pan_pool *pool,
124                          mali_ptr src_coords, mali_ptr dst_coords,
125                          mali_ptr texture, mali_ptr sampler,
126                          mali_ptr vpd, mali_ptr tsd, mali_ptr rsd,
127                          mali_ptr ubos, mali_ptr push_constants,
128                          void *out)
129 {
130    pan_pack(out, DRAW, cfg) {
131       cfg.thread_storage = tsd;
132       cfg.state = rsd;
133       cfg.uniform_buffers = ubos;
134       cfg.push_uniforms = push_constants;
135       cfg.position = dst_coords;
136       if (src_coords) {
137               panvk_meta_copy_emit_varying(pool, src_coords,
138                                            &cfg.varying_buffers,
139                                            &cfg.varyings);
140       }
141       cfg.viewport = vpd;
142       cfg.textures = texture;
143       cfg.samplers = sampler;
144    }
145 }
146 
147 static struct panfrost_ptr
panvk_meta_copy_emit_tiler_job(struct pan_pool * desc_pool,struct pan_scoreboard * scoreboard,mali_ptr src_coords,mali_ptr dst_coords,mali_ptr texture,mali_ptr sampler,mali_ptr ubo,mali_ptr push_constants,mali_ptr vpd,mali_ptr rsd,mali_ptr tsd,mali_ptr tiler)148 panvk_meta_copy_emit_tiler_job(struct pan_pool *desc_pool,
149                                struct pan_scoreboard *scoreboard,
150                                mali_ptr src_coords, mali_ptr dst_coords,
151                                mali_ptr texture, mali_ptr sampler,
152                                mali_ptr ubo, mali_ptr push_constants,
153                                mali_ptr vpd, mali_ptr rsd,
154                                mali_ptr tsd, mali_ptr tiler)
155 {
156    struct panfrost_ptr job =
157       pan_pool_alloc_desc(desc_pool, TILER_JOB);
158 
159    panvk_meta_copy_emit_dcd(desc_pool, src_coords, dst_coords,
160                             texture, sampler, vpd, tsd, rsd, ubo, push_constants,
161                             pan_section_ptr(job.cpu, TILER_JOB, DRAW));
162 
163    pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE, cfg) {
164       cfg.draw_mode = MALI_DRAW_MODE_TRIANGLE_STRIP;
165       cfg.index_count = 4;
166       cfg.job_task_split = 6;
167    }
168 
169    pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE_SIZE, cfg) {
170       cfg.constant = 1.0f;
171    }
172 
173    void *invoc = pan_section_ptr(job.cpu,
174                                  TILER_JOB,
175                                  INVOCATION);
176    panfrost_pack_work_groups_compute(invoc, 1, 4,
177                                      1, 1, 1, 1, true, false);
178 
179 #if PAN_ARCH >= 6
180    pan_section_pack(job.cpu, TILER_JOB, PADDING, cfg);
181    pan_section_pack(job.cpu, TILER_JOB, TILER, cfg) {
182       cfg.address = tiler;
183    }
184 #endif
185 
186    panfrost_add_job(desc_pool, scoreboard, MALI_JOB_TYPE_TILER,
187                     false, false, 0, 0, &job, false);
188    return job;
189 }
190 
191 static struct panfrost_ptr
panvk_meta_copy_emit_compute_job(struct pan_pool * desc_pool,struct pan_scoreboard * scoreboard,const struct pan_compute_dim * num_wg,const struct pan_compute_dim * wg_sz,mali_ptr texture,mali_ptr sampler,mali_ptr ubo,mali_ptr push_constants,mali_ptr rsd,mali_ptr tsd)192 panvk_meta_copy_emit_compute_job(struct pan_pool *desc_pool,
193                                  struct pan_scoreboard *scoreboard,
194                                  const struct pan_compute_dim *num_wg,
195                                  const struct pan_compute_dim *wg_sz,
196                                  mali_ptr texture, mali_ptr sampler,
197                                  mali_ptr ubo, mali_ptr push_constants,
198                                  mali_ptr rsd, mali_ptr tsd)
199 {
200    struct panfrost_ptr job =
201       pan_pool_alloc_desc(desc_pool, COMPUTE_JOB);
202 
203    void *invoc = pan_section_ptr(job.cpu,
204                                  COMPUTE_JOB,
205                                  INVOCATION);
206    panfrost_pack_work_groups_compute(invoc, num_wg->x, num_wg->y, num_wg->z,
207                                      wg_sz->x, wg_sz->y, wg_sz->z,
208                                      false, false);
209 
210    pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
211       cfg.job_task_split = 8;
212    }
213 
214    panvk_meta_copy_emit_dcd(desc_pool, 0, 0, texture, sampler,
215                             0, tsd, rsd, ubo, push_constants,
216                             pan_section_ptr(job.cpu, COMPUTE_JOB, DRAW));
217 
218    panfrost_add_job(desc_pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
219                     false, false, 0, 0, &job, false);
220    return job;
221 }
222 
223 
#if PAN_ARCH >= 6
/* Map a raw texel size in bytes to the matching UINT blend memory format
 * (shifted into position as expected by the conversion descriptor). */
static uint32_t
panvk_meta_copy_img_bifrost_raw_format(unsigned texelsize)
{
   uint32_t fmt;

   switch (texelsize) {
   case 6:
      fmt = MALI_RGB16UI;
      break;
   case 8:
      fmt = MALI_RG32UI;
      break;
   case 12:
      fmt = MALI_RGB32UI;
      break;
   case 16:
      fmt = MALI_RGBA32UI;
      break;
   default:
      unreachable("Invalid texel size\n");
   }

   return fmt << 12;
}
#endif
237 
/*
 * Emit the RENDERER_STATE + BLEND descriptor pair used when the copy
 * destination is an image written through the fragment pipeline.
 *
 * fmt/wrmask describe the render-target format and the components the copy
 * may write; from_img is set when the source is an image (one varying for
 * the source coordinates plus one texture/sampler pair).
 * Returns the GPU address of the renderer state descriptor.
 */
static mali_ptr
panvk_meta_copy_to_img_emit_rsd(struct panfrost_device *pdev,
                                struct pan_pool *desc_pool,
                                mali_ptr shader,
                                const struct pan_shader_info *shader_info,
                                enum pipe_format fmt, unsigned wrmask,
                                bool from_img)
{
   struct panfrost_ptr rsd_ptr =
      pan_pool_alloc_desc_aggregate(desc_pool,
                                    PAN_DESC(RENDERER_STATE),
                                    PAN_DESC_ARRAY(1, BLEND));

   /* Texels wider than 32 bits are handled as raw integer data. */
   bool raw = util_format_get_blocksize(fmt) > 4;
   unsigned fullmask = (1 << util_format_get_nr_components(fmt)) - 1;
   /* Partial component write: blend with the destination for non-raw
    * formats, read the tilebuffer in the shader for raw ones. */
   bool partialwrite = fullmask != wrmask && !raw;
   bool readstb = fullmask != wrmask && raw;

   pan_pack(rsd_ptr.cpu, RENDERER_STATE, cfg) {
      pan_shader_prepare_rsd(shader_info, shader, &cfg);
      if (from_img) {
         cfg.shader.varying_count = 1;
         cfg.shader.texture_count = 1;
         cfg.shader.sampler_count = 1;
      }
      /* Depth/stencil always pass so every covered pixel is written. */
      cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
      cfg.multisample_misc.sample_mask = UINT16_MAX;
      cfg.multisample_misc.depth_function = MALI_FUNC_ALWAYS;
      cfg.stencil_mask_misc.stencil_mask_front = 0xFF;
      cfg.stencil_mask_misc.stencil_mask_back = 0xFF;
      cfg.stencil_front.compare_function = MALI_FUNC_ALWAYS;
      cfg.stencil_front.stencil_fail = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.depth_fail = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.depth_pass = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.mask = 0xFF;
      cfg.stencil_back = cfg.stencil_front;

#if PAN_ARCH >= 6
      /* Forward pixel kill is only allowed when the full mask is written
       * and the shader doesn't read the tilebuffer. */
      cfg.properties.allow_forward_pixel_to_be_killed = true;
      cfg.properties.allow_forward_pixel_to_kill =
         !partialwrite && !readstb;
      cfg.properties.zs_update_operation =
         MALI_PIXEL_KILL_STRONG_EARLY;
      cfg.properties.pixel_kill_operation =
         MALI_PIXEL_KILL_FORCE_EARLY;
#else
      cfg.properties.shader_reads_tilebuffer = readstb;
      cfg.properties.work_register_count = shader_info->work_reg_count;
      cfg.properties.force_early_z = true;
      cfg.stencil_mask_misc.alpha_test_compare_function = MALI_FUNC_ALWAYS;
#endif
   }

   /* The BLEND descriptor follows the renderer state. Source-passthrough
    * equation; partial writes load the destination and restrict the color
    * mask so untouched components survive. */
   pan_pack(rsd_ptr.cpu + pan_size(RENDERER_STATE), BLEND, cfg) {
      cfg.round_to_fb_precision = true;
      cfg.load_destination = partialwrite;
      cfg.equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
      cfg.equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
      cfg.equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
      cfg.equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
      cfg.equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
      cfg.equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
#if PAN_ARCH >= 6
      cfg.internal.mode =
         partialwrite ?
         MALI_BLEND_MODE_FIXED_FUNCTION :
         MALI_BLEND_MODE_OPAQUE;
      cfg.equation.color_mask = partialwrite ? wrmask : 0xf;
      cfg.internal.fixed_function.num_comps = 4;
      if (!raw) {
         cfg.internal.fixed_function.conversion.memory_format =
            panfrost_format_to_bifrost_blend(pdev, fmt, false);
         cfg.internal.fixed_function.conversion.register_format =
            MALI_REGISTER_FILE_FORMAT_F32;
      } else {
         /* Raw copies: UINT memory format matching the texel size;
          * 16-bit-component sizes (6/16 mod 4 == 2) use the U16 file. */
         unsigned imgtexelsz = util_format_get_blocksize(fmt);

         cfg.internal.fixed_function.conversion.memory_format =
            panvk_meta_copy_img_bifrost_raw_format(imgtexelsz);
         cfg.internal.fixed_function.conversion.register_format =
            (imgtexelsz & 2) ?
            MALI_REGISTER_FILE_FORMAT_U16 :
            MALI_REGISTER_FILE_FORMAT_U32;
      }
#else
      cfg.equation.color_mask = wrmask;
#endif
   }

   return rsd_ptr.gpu;
}
329 
330 static mali_ptr
panvk_meta_copy_emit_ubo(struct panfrost_device * pdev,struct pan_pool * pool,void * data,unsigned size)331 panvk_meta_copy_emit_ubo(struct panfrost_device *pdev,
332                          struct pan_pool *pool,
333                          void *data, unsigned size)
334 {
335    struct panfrost_ptr ubo = pan_pool_alloc_desc(pool, UNIFORM_BUFFER);
336 
337    pan_pack(ubo.cpu, UNIFORM_BUFFER, cfg) {
338       cfg.entries = DIV_ROUND_UP(size, 16);
339       cfg.pointer = pan_pool_upload_aligned(pool, data, size, 16);
340    }
341 
342    return ubo.gpu;
343 }
344 
345 static mali_ptr
panvk_meta_copy_emit_push_constants(struct panfrost_device * pdev,const struct panfrost_ubo_push * pushmap,struct pan_pool * pool,const void * data,unsigned size)346 panvk_meta_copy_emit_push_constants(struct panfrost_device *pdev,
347                                     const struct panfrost_ubo_push *pushmap,
348                                     struct pan_pool *pool,
349                                     const void *data, unsigned size)
350 {
351    assert(pushmap->count <= (size / 4));
352 
353    const uint32_t *in = data;
354    uint32_t pushvals[PAN_MAX_PUSH];
355 
356    for (unsigned i = 0; i < pushmap->count; i++) {
357       assert(i < ARRAY_SIZE(pushvals));
358       assert(pushmap->words[i].ubo == 0);
359       assert(pushmap->words[i].offset < size);
360       pushvals[i] = in[pushmap->words[i].offset / 4];
361    }
362 
363    return pan_pool_upload_aligned(pool, pushvals, size, 16);
364 }
365 
366 static mali_ptr
panvk_meta_copy_to_buf_emit_rsd(struct panfrost_device * pdev,struct pan_pool * desc_pool,mali_ptr shader,const struct pan_shader_info * shader_info,bool from_img)367 panvk_meta_copy_to_buf_emit_rsd(struct panfrost_device *pdev,
368                                 struct pan_pool *desc_pool,
369                                 mali_ptr shader,
370                                 const struct pan_shader_info *shader_info,
371                                 bool from_img)
372 {
373    struct panfrost_ptr rsd_ptr =
374       pan_pool_alloc_desc_aggregate(desc_pool,
375                                     PAN_DESC(RENDERER_STATE));
376 
377    pan_pack(rsd_ptr.cpu, RENDERER_STATE, cfg) {
378       pan_shader_prepare_rsd(shader_info, shader, &cfg);
379       if (from_img) {
380          cfg.shader.texture_count = 1;
381          cfg.shader.sampler_count = 1;
382       }
383    }
384 
385    return rsd_ptr.gpu;
386 }
387 
/*
 * Build, compile and upload the fragment shader used for image-to-image
 * copies: fetch a texel from the source view at the (truncated) varying
 * coordinates and write it to color output 0, repacking between the
 * "compatible" R5G6B5 and R8G8 formats when needed.
 *
 * Returns the GPU address of the uploaded binary; compile results land in
 * shader_info (including fs.sample_shading for multisampled copies).
 */
static mali_ptr
panvk_meta_copy_img2img_shader(struct panfrost_device *pdev,
                               struct pan_pool *bin_pool,
                               enum pipe_format srcfmt,
                               enum pipe_format dstfmt, unsigned dstmask,
                               unsigned texdim, bool texisarray, bool is_ms,
                               struct pan_shader_info *shader_info)
{
   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
                                     GENX(pan_shader_get_compiler_options)(),
                                     "panvk_meta_copy_img2img(srcfmt=%s,dstfmt=%s,%dD%s%s)",
                                     util_format_name(srcfmt), util_format_name(dstfmt),
                                     texdim, texisarray ? "[]" : "", is_ms ? ",ms" : "");

   /* Source texel coordinates come in as a float varying on TEX0 and are
    * truncated to integers for the unfiltered fetch. */
   nir_variable *coord_var =
      nir_variable_create(b.shader, nir_var_shader_in,
                          glsl_vector_type(GLSL_TYPE_FLOAT, texdim + texisarray),
                          "coord");
   coord_var->data.location = VARYING_SLOT_TEX0;
   nir_ssa_def *coord = nir_f2u32(&b, nir_load_var(&b, coord_var));

   /* txf (or txf_ms with an extra sample-index source). */
   nir_tex_instr *tex = nir_tex_instr_create(b.shader, is_ms ? 2 : 1);
   tex->op = is_ms ? nir_texop_txf_ms : nir_texop_txf;
   tex->texture_index = 0;
   tex->is_array = texisarray;
   tex->dest_type = util_format_is_unorm(srcfmt) ?
                    nir_type_float32 : nir_type_uint32;

   switch (texdim) {
   case 1: tex->sampler_dim = GLSL_SAMPLER_DIM_1D; break;
   case 2: tex->sampler_dim = GLSL_SAMPLER_DIM_2D; break;
   case 3: tex->sampler_dim = GLSL_SAMPLER_DIM_3D; break;
   default: unreachable("Invalid texture dimension");
   }

   tex->src[0].src_type = nir_tex_src_coord;
   tex->src[0].src = nir_src_for_ssa(coord);
   tex->coord_components = texdim + texisarray;

   if (is_ms) {
      tex->src[1].src_type = nir_tex_src_ms_index;
      tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(&b));
   }

   nir_ssa_dest_init(&tex->instr, &tex->dest, 4,
                     nir_alu_type_get_type_size(tex->dest_type), NULL);
   nir_builder_instr_insert(&b, &tex->instr);

   nir_ssa_def *texel = &tex->dest.ssa;

   unsigned dstcompsz =
      util_format_get_component_bits(dstfmt, UTIL_FORMAT_COLORSPACE_RGB, 0);
   unsigned ndstcomps = util_format_get_nr_components(dstfmt);
   const struct glsl_type *outtype = NULL;

   if (srcfmt == PIPE_FORMAT_R5G6B5_UNORM && dstfmt == PIPE_FORMAT_R8G8_UNORM) {
      /* RGB565 -> RG88: quantize the unorm channels to 5/6/5-bit ints,
       * repack the 16 bits into two bytes, then renormalize to 8-bit. */
      nir_ssa_def *rgb =
         nir_f2u32(&b, nir_fmul(&b, texel,
                                nir_vec3(&b,
                                         nir_imm_float(&b, 31),
                                         nir_imm_float(&b, 63),
                                         nir_imm_float(&b, 31))));
      nir_ssa_def *rg =
         nir_vec2(&b,
                  nir_ior(&b, nir_channel(&b, rgb, 0),
                          nir_ishl(&b, nir_channel(&b, rgb, 1),
                                   nir_imm_int(&b, 5))),
                  nir_ior(&b,
                          nir_ushr_imm(&b, nir_channel(&b, rgb, 1), 3),
                          nir_ishl(&b, nir_channel(&b, rgb, 2),
                                   nir_imm_int(&b, 3))));
      rg = nir_iand_imm(&b, rg, 255);
      texel = nir_fmul_imm(&b, nir_u2f32(&b, rg), 1.0 / 255);
      outtype = glsl_vector_type(GLSL_TYPE_FLOAT, 2);
   } else if (srcfmt == PIPE_FORMAT_R8G8_UNORM && dstfmt == PIPE_FORMAT_R5G6B5_UNORM) {
      /* RG88 -> RGB565: inverse repacking of the case above. */
      nir_ssa_def *rg = nir_f2u32(&b, nir_fmul_imm(&b, texel, 255));
      nir_ssa_def *rgb =
         nir_vec3(&b,
                  nir_channel(&b, rg, 0),
                  nir_ior(&b,
                          nir_ushr_imm(&b, nir_channel(&b, rg, 0), 5),
                          nir_ishl(&b, nir_channel(&b, rg, 1),
                                   nir_imm_int(&b, 3))),
                  nir_ushr_imm(&b, nir_channel(&b, rg, 1), 3));
      rgb = nir_iand(&b, rgb,
                     nir_vec3(&b,
                              nir_imm_int(&b, 31),
                              nir_imm_int(&b, 63),
                              nir_imm_int(&b, 31)));
      texel = nir_fmul(&b, nir_u2f32(&b, rgb),
                       nir_vec3(&b,
                                nir_imm_float(&b, 1.0 / 31),
                                nir_imm_float(&b, 1.0 / 63),
                                nir_imm_float(&b, 1.0 / 31)));
      outtype = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
   } else {
      /* Same-format copy: narrow the texel to the destination component
       * size and count. */
      assert(srcfmt == dstfmt);
      enum glsl_base_type basetype;
      if (util_format_is_unorm(dstfmt)) {
         basetype = GLSL_TYPE_FLOAT;
      } else if (dstcompsz == 16) {
         basetype = GLSL_TYPE_UINT16;
      } else {
         assert(dstcompsz == 32);
         basetype = GLSL_TYPE_UINT;
      }

      if (dstcompsz == 16)
         texel = nir_u2u16(&b, texel);

      texel = nir_channels(&b, texel, (1 << ndstcomps) - 1);
      outtype = glsl_vector_type(basetype, ndstcomps);
   }

   nir_variable *out =
      nir_variable_create(b.shader, nir_var_shader_out, outtype, "out");
   out->data.location = FRAG_RESULT_DATA0;

   /* Partial write of a >8-bit-per-component format: read the current
    * render-target value and merge the unwritten components back in. */
   unsigned fullmask = (1 << ndstcomps) - 1;
   if (dstcompsz > 8 && dstmask != fullmask) {
      nir_ssa_def *oldtexel = nir_load_var(&b, out);
      nir_ssa_def *dstcomps[4];

      for (unsigned i = 0; i < ndstcomps; i++) {
         if (dstmask & BITFIELD_BIT(i))
            dstcomps[i] = nir_channel(&b, texel, i);
         else
            dstcomps[i] = nir_channel(&b, oldtexel, i);
      }

      texel = nir_vec(&b, dstcomps, ndstcomps);
   }

   nir_store_var(&b, out, texel, 0xff);

   struct panfrost_compile_inputs inputs = {
      .gpu_id = pdev->gpu_id,
      .is_blit = true,
   };

#if PAN_ARCH >= 6
   /* Static render-target conversion for the compiler.
    * NOTE(review): dstcompsz is in bits (8/16/32 above), so the `== 2`
    * comparisons below can never be true and always select the 32-bit
    * variants — confirm whether `== 16` was intended. */
   pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) {
      cfg.memory_format = (dstcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12;
      cfg.register_format = dstcompsz == 2 ?
                            MALI_REGISTER_FILE_FORMAT_U16 :
                            MALI_REGISTER_FILE_FORMAT_U32;
   }
   inputs.bifrost.static_rt_conv = true;
#endif

   struct util_dynarray binary;

   util_dynarray_init(&binary, NULL);
   GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);

   /* Multisampled copies must run per-sample. */
   shader_info->fs.sample_shading = is_ms;

   mali_ptr shader =
      pan_pool_upload_aligned(bin_pool, binary.data, binary.size,
                              PAN_ARCH >= 6 ? 128 : 64);

   util_dynarray_fini(&binary);
   ralloc_free(b.shader);

   return shader;
}
555 
556 static enum pipe_format
panvk_meta_copy_img_format(enum pipe_format fmt)557 panvk_meta_copy_img_format(enum pipe_format fmt)
558 {
559    /* We can't use a non-compressed format when handling a tiled/AFBC
560     * compressed format because the tile size differ (4x4 blocks for
561     * compressed formats and 16x16 texels for non-compressed ones).
562     */
563    assert(!util_format_is_compressed(fmt));
564 
565    /* Pick blendable formats when we can, otherwise pick the UINT variant
566     * matching the texel size.
567     */
568    switch (util_format_get_blocksize(fmt)) {
569    case 16: return PIPE_FORMAT_R32G32B32A32_UINT;
570    case 12: return PIPE_FORMAT_R32G32B32_UINT;
571    case 8: return PIPE_FORMAT_R32G32_UINT;
572    case 6: return PIPE_FORMAT_R16G16B16_UINT;
573    case 4: return PIPE_FORMAT_R8G8B8A8_UNORM;
574    case 2: return (fmt == PIPE_FORMAT_R5G6B5_UNORM ||
575                    fmt == PIPE_FORMAT_B5G6R5_UNORM) ?
576                   PIPE_FORMAT_R5G6B5_UNORM : PIPE_FORMAT_R8G8_UNORM;
577    case 1: return PIPE_FORMAT_R8_UNORM;
578    default: unreachable("Unsupported format\n");
579    }
580 }
581 
/* Key identifying one pre-generated img2img copy pipeline: canonical
 * source/destination formats plus the destination component write mask. */
struct panvk_meta_copy_img2img_format_info {
   enum pipe_format srcfmt;
   enum pipe_format dstfmt;
   unsigned dstmask;
};

/* All supported (srcfmt, dstfmt, dstmask) combinations; indexed by
 * panvk_meta_copy_img2img_format_idx(), so the entry count must match
 * PANVK_META_COPY_IMG2IMG_NUM_FORMATS. */
static const struct panvk_meta_copy_img2img_format_info panvk_meta_copy_img2img_fmts[] = {
   { PIPE_FORMAT_R8_UNORM, PIPE_FORMAT_R8_UNORM, 0x1},
   { PIPE_FORMAT_R5G6B5_UNORM, PIPE_FORMAT_R5G6B5_UNORM, 0x7},
   { PIPE_FORMAT_R5G6B5_UNORM, PIPE_FORMAT_R8G8_UNORM, 0x3},
   { PIPE_FORMAT_R8G8_UNORM, PIPE_FORMAT_R5G6B5_UNORM, 0x7},
   { PIPE_FORMAT_R8G8_UNORM, PIPE_FORMAT_R8G8_UNORM, 0x3},
   /* Z24S8(depth) */
   { PIPE_FORMAT_R8G8B8A8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM, 0x7 },
   /* Z24S8(stencil) */
   { PIPE_FORMAT_R8G8B8A8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM, 0x8 },
   { PIPE_FORMAT_R8G8B8A8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM, 0xf },
   { PIPE_FORMAT_R16G16B16_UINT, PIPE_FORMAT_R16G16B16_UINT, 0x7 },
   { PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32_UINT, 0x3 },
   /* Z32S8X24(depth) */
   { PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32_UINT, 0x1 },
   /* Z32S8X24(stencil) */
   { PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32_UINT, 0x2 },
   { PIPE_FORMAT_R32G32B32_UINT, PIPE_FORMAT_R32G32B32_UINT, 0x7 },
   { PIPE_FORMAT_R32G32B32A32_UINT, PIPE_FORMAT_R32G32B32A32_UINT, 0xf },
};
608 
609 static unsigned
panvk_meta_copy_img2img_format_idx(struct panvk_meta_copy_img2img_format_info key)610 panvk_meta_copy_img2img_format_idx(struct panvk_meta_copy_img2img_format_info key)
611 {
612    STATIC_ASSERT(ARRAY_SIZE(panvk_meta_copy_img2img_fmts) == PANVK_META_COPY_IMG2IMG_NUM_FORMATS);
613 
614    for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_img2img_fmts); i++) {
615       if (!memcmp(&key, &panvk_meta_copy_img2img_fmts[i], sizeof(key)))
616          return i;
617    }
618 
619    unreachable("Invalid image format\n");
620 }
621 
622 static unsigned
panvk_meta_copy_img_mask(enum pipe_format imgfmt,VkImageAspectFlags aspectMask)623 panvk_meta_copy_img_mask(enum pipe_format imgfmt, VkImageAspectFlags aspectMask)
624 {
625    if (aspectMask != VK_IMAGE_ASPECT_DEPTH_BIT &&
626        aspectMask != VK_IMAGE_ASPECT_STENCIL_BIT) {
627       enum pipe_format outfmt = panvk_meta_copy_img_format(imgfmt);
628 
629       return (1 << util_format_get_nr_components(outfmt)) - 1;
630    }
631 
632    switch (imgfmt) {
633    case PIPE_FORMAT_S8_UINT:
634       return 1;
635    case PIPE_FORMAT_Z16_UNORM:
636       return 3;
637    case PIPE_FORMAT_Z16_UNORM_S8_UINT:
638       return aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT ? 3 : 8;
639    case PIPE_FORMAT_Z24_UNORM_S8_UINT:
640       return aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT ? 7 : 8;
641    case PIPE_FORMAT_Z24X8_UNORM:
642       assert(aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT);
643       return 7;
644    case PIPE_FORMAT_Z32_FLOAT:
645       return 0xf;
646    case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
647       return aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT ? 1 : 2;
648    default:
649       unreachable("Invalid depth format\n");
650    }
651 }
652 
653 static void
panvk_meta_copy_img2img(struct panvk_cmd_buffer * cmdbuf,const struct panvk_image * src,const struct panvk_image * dst,const VkImageCopy2 * region)654 panvk_meta_copy_img2img(struct panvk_cmd_buffer *cmdbuf,
655                         const struct panvk_image *src,
656                         const struct panvk_image *dst,
657                         const VkImageCopy2 *region)
658 {
659    struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;
660    struct pan_fb_info *fbinfo = &cmdbuf->state.fb.info;
661    struct panvk_meta_copy_img2img_format_info key = {
662       .srcfmt = panvk_meta_copy_img_format(src->pimage.layout.format),
663       .dstfmt = panvk_meta_copy_img_format(dst->pimage.layout.format),
664       .dstmask = panvk_meta_copy_img_mask(dst->pimage.layout.format,
665                                           region->dstSubresource.aspectMask),
666    };
667 
668    assert(src->pimage.layout.nr_samples == dst->pimage.layout.nr_samples);
669 
670    unsigned texdimidx =
671       panvk_meta_copy_tex_type(src->pimage.layout.dim,
672                                src->pimage.layout.array_size > 1);
673    unsigned fmtidx =
674       panvk_meta_copy_img2img_format_idx(key);
675    unsigned ms = dst->pimage.layout.nr_samples > 1 ? 1 : 0;
676 
677    mali_ptr rsd =
678       cmdbuf->device->physical_device->meta.copy.img2img[ms][texdimidx][fmtidx].rsd;
679 
680    struct pan_image_view srcview = {
681       .format = key.srcfmt,
682       .dim = src->pimage.layout.dim == MALI_TEXTURE_DIMENSION_CUBE ?
683              MALI_TEXTURE_DIMENSION_2D : src->pimage.layout.dim,
684       .image = &src->pimage,
685       .nr_samples = src->pimage.layout.nr_samples,
686       .first_level = region->srcSubresource.mipLevel,
687       .last_level = region->srcSubresource.mipLevel,
688       .first_layer = region->srcSubresource.baseArrayLayer,
689       .last_layer = region->srcSubresource.baseArrayLayer + region->srcSubresource.layerCount - 1,
690       .swizzle = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
691    };
692 
693    struct pan_image_view dstview = {
694       .format = key.dstfmt,
695       .dim = MALI_TEXTURE_DIMENSION_2D,
696       .image = &dst->pimage,
697       .nr_samples = dst->pimage.layout.nr_samples,
698       .first_level = region->dstSubresource.mipLevel,
699       .last_level = region->dstSubresource.mipLevel,
700       .swizzle = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
701    };
702 
703    unsigned minx = MAX2(region->dstOffset.x, 0);
704    unsigned miny = MAX2(region->dstOffset.y, 0);
705    unsigned maxx = MAX2(region->dstOffset.x + region->extent.width - 1, 0);
706    unsigned maxy = MAX2(region->dstOffset.y + region->extent.height - 1, 0);
707 
708    mali_ptr vpd =
709       panvk_per_arch(meta_emit_viewport)(&cmdbuf->desc_pool.base,
710                                          minx, miny, maxx, maxy);
711 
712    float dst_rect[] = {
713       minx, miny, 0.0, 1.0,
714       maxx + 1, miny, 0.0, 1.0,
715       minx, maxy + 1, 0.0, 1.0,
716       maxx + 1, maxy + 1, 0.0, 1.0,
717    };
718 
719    mali_ptr dst_coords =
720       pan_pool_upload_aligned(&cmdbuf->desc_pool.base, dst_rect,
721                               sizeof(dst_rect), 64);
722 
723    /* TODO: don't force preloads of dst resources if unneeded */
724 
725    unsigned width = u_minify(dst->pimage.layout.width, region->dstSubresource.mipLevel);
726    unsigned height = u_minify(dst->pimage.layout.height, region->dstSubresource.mipLevel);
727    cmdbuf->state.fb.crc_valid[0] = false;
728    *fbinfo = (struct pan_fb_info){
729       .width = width,
730       .height = height,
731       .extent.minx = minx & ~31,
732       .extent.miny = miny & ~31,
733       .extent.maxx = MIN2(ALIGN_POT(maxx + 1, 32), width) - 1,
734       .extent.maxy = MIN2(ALIGN_POT(maxy + 1, 32), height) - 1,
735       .nr_samples = dst->pimage.layout.nr_samples,
736       .rt_count = 1,
737       .rts[0].view = &dstview,
738       .rts[0].preload = true,
739       .rts[0].crc_valid = &cmdbuf->state.fb.crc_valid[0],
740    };
741 
742    mali_ptr texture =
743       panvk_meta_copy_img_emit_texture(pdev, &cmdbuf->desc_pool.base, &srcview);
744    mali_ptr sampler =
745       panvk_meta_copy_img_emit_sampler(pdev, &cmdbuf->desc_pool.base);
746 
747    panvk_per_arch(cmd_close_batch)(cmdbuf);
748 
749    minx = MAX2(region->srcOffset.x, 0);
750    miny = MAX2(region->srcOffset.y, 0);
751    maxx = MAX2(region->srcOffset.x + region->extent.width - 1, 0);
752    maxy = MAX2(region->srcOffset.y + region->extent.height - 1, 0);
753    assert(region->dstOffset.z >= 0);
754 
755    unsigned first_src_layer = MAX2(0, region->srcOffset.z);
756    unsigned first_dst_layer = MAX2(region->dstSubresource.baseArrayLayer, region->dstOffset.z);
757    unsigned nlayers = MAX2(region->dstSubresource.layerCount, region->extent.depth);
758    for (unsigned l = 0; l < nlayers; l++) {
759       unsigned src_l = l + first_src_layer;
760       float src_rect[] = {
761          minx, miny, src_l, 1.0,
762          maxx + 1, miny, src_l, 1.0,
763          minx, maxy + 1, src_l, 1.0,
764          maxx + 1, maxy + 1, src_l, 1.0,
765       };
766 
767       mali_ptr src_coords =
768          pan_pool_upload_aligned(&cmdbuf->desc_pool.base, src_rect,
769                                  sizeof(src_rect), 64);
770 
771       struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);
772 
773       dstview.first_layer = dstview.last_layer = l + first_dst_layer;
774       batch->blit.src = src->pimage.data.bo;
775       batch->blit.dst = dst->pimage.data.bo;
776       panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, true);
777       panvk_per_arch(cmd_alloc_fb_desc)(cmdbuf);
778       panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf);
779 
780       mali_ptr tsd, tiler;
781 
782 #if PAN_ARCH >= 6
783       tsd = batch->tls.gpu;
784       tiler = batch->tiler.descs.gpu;
785 #else
786       tsd = batch->fb.desc.gpu;
787       tiler = 0;
788 #endif
789 
790       struct panfrost_ptr job;
791 
792       job = panvk_meta_copy_emit_tiler_job(&cmdbuf->desc_pool.base,
793                                            &batch->scoreboard,
794                                            src_coords, dst_coords,
795                                            texture, sampler, 0, 0,
796                                            vpd, rsd, tsd, tiler);
797 
798       util_dynarray_append(&batch->jobs, void *, job.cpu);
799       panvk_per_arch(cmd_close_batch)(cmdbuf);
800    }
801 }
802 
803 static void
panvk_meta_copy_img2img_init(struct panvk_physical_device * dev,bool is_ms)804 panvk_meta_copy_img2img_init(struct panvk_physical_device *dev, bool is_ms)
805 {
806    STATIC_ASSERT(ARRAY_SIZE(panvk_meta_copy_img2img_fmts) == PANVK_META_COPY_IMG2IMG_NUM_FORMATS);
807 
808    for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_img2img_fmts); i++) {
809       for (unsigned texdim = 1; texdim <= 3; texdim++) {
810          unsigned texdimidx = panvk_meta_copy_tex_type(texdim, false);
811          assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2img[0]));
812 
813          /* No MSAA on 3D textures */
814          if (texdim == 3 && is_ms) continue;
815 
816          struct pan_shader_info shader_info;
817          mali_ptr shader =
818             panvk_meta_copy_img2img_shader(&dev->pdev, &dev->meta.bin_pool.base,
819                                            panvk_meta_copy_img2img_fmts[i].srcfmt,
820                                            panvk_meta_copy_img2img_fmts[i].dstfmt,
821                                            panvk_meta_copy_img2img_fmts[i].dstmask,
822                                            texdim, false, is_ms, &shader_info);
823          dev->meta.copy.img2img[is_ms][texdimidx][i].rsd =
824             panvk_meta_copy_to_img_emit_rsd(&dev->pdev, &dev->meta.desc_pool.base,
825                                             shader, &shader_info,
826                                             panvk_meta_copy_img2img_fmts[i].dstfmt,
827                                             panvk_meta_copy_img2img_fmts[i].dstmask,
828                                             true);
829          if (texdim == 3)
830             continue;
831 
832          memset(&shader_info, 0, sizeof(shader_info));
833          texdimidx = panvk_meta_copy_tex_type(texdim, true);
834          assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2img[0]));
835          shader =
836             panvk_meta_copy_img2img_shader(&dev->pdev, &dev->meta.bin_pool.base,
837                                            panvk_meta_copy_img2img_fmts[i].srcfmt,
838                                            panvk_meta_copy_img2img_fmts[i].dstfmt,
839                                            panvk_meta_copy_img2img_fmts[i].dstmask,
840                                            texdim, true, is_ms, &shader_info);
841          dev->meta.copy.img2img[is_ms][texdimidx][i].rsd =
842             panvk_meta_copy_to_img_emit_rsd(&dev->pdev, &dev->meta.desc_pool.base,
843                                             shader, &shader_info,
844                                             panvk_meta_copy_img2img_fmts[i].dstfmt,
845                                             panvk_meta_copy_img2img_fmts[i].dstmask,
846                                             true);
847       }
848    }
849 }
850 
851 void
panvk_per_arch(CmdCopyImage2)852 panvk_per_arch(CmdCopyImage2)(VkCommandBuffer commandBuffer,
853                               const VkCopyImageInfo2 *pCopyImageInfo)
854 {
855    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
856    VK_FROM_HANDLE(panvk_image, dst, pCopyImageInfo->dstImage);
857    VK_FROM_HANDLE(panvk_image, src, pCopyImageInfo->srcImage);
858 
859    for (unsigned i = 0; i < pCopyImageInfo->regionCount; i++) {
860       panvk_meta_copy_img2img(cmdbuf, src, dst, &pCopyImageInfo->pRegions[i]);
861    }
862 }
863 
864 static unsigned
panvk_meta_copy_buf_texelsize(enum pipe_format imgfmt,unsigned mask)865 panvk_meta_copy_buf_texelsize(enum pipe_format imgfmt, unsigned mask)
866 {
867    unsigned imgtexelsz = util_format_get_blocksize(imgfmt);
868    unsigned nbufcomps = util_bitcount(mask);
869 
870    if (nbufcomps == util_format_get_nr_components(imgfmt))
871       return imgtexelsz;
872 
873    /* Special case for Z24 buffers which are not tightly packed */
874    if (mask == 7 && imgtexelsz == 4)
875       return 4;
876 
877    /* Special case for S8 extraction from Z32_S8X24 */
878    if (mask == 2 && imgtexelsz == 8)
879       return 1;
880 
881    unsigned compsz =
882       util_format_get_component_bits(imgfmt, UTIL_FORMAT_COLORSPACE_RGB, 0);
883 
884    assert(!(compsz % 8));
885 
886    return nbufcomps * compsz / 8;
887 }
888 
889 static enum pipe_format
panvk_meta_copy_buf2img_format(enum pipe_format imgfmt)890 panvk_meta_copy_buf2img_format(enum pipe_format imgfmt)
891 {
892    /* Pick blendable formats when we can, and the FLOAT variant matching the
893     * texelsize otherwise.
894     */
895    switch (util_format_get_blocksize(imgfmt)) {
896    case 1: return PIPE_FORMAT_R8_UNORM;
897    /* AFBC stores things differently for RGB565,
898     * we can't simply map to R8G8 in that case */
899    case 2: return (imgfmt == PIPE_FORMAT_R5G6B5_UNORM ||
900                    imgfmt == PIPE_FORMAT_B5G6R5_UNORM) ?
901                   PIPE_FORMAT_R5G6B5_UNORM : PIPE_FORMAT_R8G8_UNORM;
902    case 4: return PIPE_FORMAT_R8G8B8A8_UNORM;
903    case 6: return PIPE_FORMAT_R16G16B16_UINT;
904    case 8: return PIPE_FORMAT_R32G32_UINT;
905    case 12: return PIPE_FORMAT_R32G32B32_UINT;
906    case 16: return PIPE_FORMAT_R32G32B32A32_UINT;
907    default: unreachable("Invalid format\n");
908    }
909 }
910 
/* Key identifying one precompiled meta-copy pipeline: the canonical copy
 * format (see panvk_meta_copy_*_format()) plus the mask of components the
 * shader actually writes.
 */
struct panvk_meta_copy_format_info {
   enum pipe_format imgfmt; /* canonical render-target/texture format */
   unsigned mask;           /* bit i set => component i is written */
};
915 
/* All (format, component mask) pairs buffer-to-image copies are lowered to.
 * Kept in sync with PANVK_META_COPY_BUF2IMG_NUM_FORMATS via the
 * STATIC_ASSERT in panvk_meta_copy_buf2img_init().
 */
static const struct panvk_meta_copy_format_info panvk_meta_copy_buf2img_fmts[] = {
   { PIPE_FORMAT_R8_UNORM, 0x1 },
   { PIPE_FORMAT_R8G8_UNORM, 0x3 },
   { PIPE_FORMAT_R5G6B5_UNORM, 0x7 },
   { PIPE_FORMAT_R8G8B8A8_UNORM, 0xf },
   { PIPE_FORMAT_R16G16B16_UINT, 0x7 },
   { PIPE_FORMAT_R32G32_UINT, 0x3 },
   { PIPE_FORMAT_R32G32B32_UINT, 0x7 },
   { PIPE_FORMAT_R32G32B32A32_UINT, 0xf },
   /* S8 -> Z24S8 */
   { PIPE_FORMAT_R8G8B8A8_UNORM, 0x8 },
   /* S8 -> Z32_S8X24 */
   { PIPE_FORMAT_R32G32_UINT, 0x2 },
   /* Z24X8 -> Z24S8 */
   { PIPE_FORMAT_R8G8B8A8_UNORM, 0x7 },
   /* Z32 -> Z32_S8X24 */
   { PIPE_FORMAT_R32G32_UINT, 0x1 },
};
934 
/* UBO payload consumed by the buffer-to-image copy fragment shader:
 * where the source buffer lives and how its rows/slices are strided.
 */
struct panvk_meta_copy_buf2img_info {
   struct {
      mali_ptr ptr;     /* GPU address of the first texel to read */
      struct {
         unsigned line; /* bytes between two buffer rows */
         unsigned surf; /* bytes between two buffer slices/layers */
      } stride;
   } buf;
};
944 
/* Emit a NIR load of one field of struct panvk_meta_copy_buf2img_info from
 * UBO 0; offsetof()/sizeof() derive the byte offset and bit size at compile
 * time so the shader layout always matches the C struct.
 */
#define panvk_meta_copy_buf2img_get_info_field(b, field) \
        nir_load_ubo((b), 1, \
                     sizeof(((struct panvk_meta_copy_buf2img_info *)0)->field) * 8, \
                     nir_imm_int(b, 0), \
                     nir_imm_int(b, offsetof(struct panvk_meta_copy_buf2img_info, field)), \
                     .align_mul = 4, \
                     .align_offset = 0, \
                     .range_base = 0, \
                     .range = ~0)
954 
/* Builds the fragment shader used for buffer -> image copies.
 *
 * The destination image is bound as RT0 and covered by a quad; for each
 * fragment the shader turns the interpolated texel coordinate into a byte
 * offset in the source buffer (base pointer and strides come from UBO 0,
 * see struct panvk_meta_copy_buf2img_info), loads one buffer texel and
 * writes it to the render target.
 *
 * Returns the GPU address of the uploaded binary; *shader_info receives
 * the compiled shader metadata.
 */
static mali_ptr
panvk_meta_copy_buf2img_shader(struct panfrost_device *pdev,
                               struct pan_pool *bin_pool,
                               struct panvk_meta_copy_format_info key,
                               struct pan_shader_info *shader_info)
{
   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
                                     GENX(pan_shader_get_compiler_options)(),
                                     "panvk_meta_copy_buf2img(imgfmt=%s,mask=%x)",
                                     util_format_name(key.imgfmt),
                                     key.mask);

   /* Single UBO carrying panvk_meta_copy_buf2img_info. */
   b.shader->info.num_ubos = 1;

   /* The destination texel position arrives as a float (x, y, layer)
    * varying and is truncated to integer coordinates.
    */
   nir_variable *coord_var =
      nir_variable_create(b.shader, nir_var_shader_in,
                          glsl_vector_type(GLSL_TYPE_FLOAT, 3),
                          "coord");
   coord_var->data.location = VARYING_SLOT_TEX0;
   nir_ssa_def *coord = nir_load_var(&b, coord_var);

   coord = nir_f2u32(&b, coord);

   nir_ssa_def *bufptr =
      panvk_meta_copy_buf2img_get_info_field(&b, buf.ptr);
   nir_ssa_def *buflinestride =
      panvk_meta_copy_buf2img_get_info_field(&b, buf.stride.line);
   nir_ssa_def *bufsurfstride =
      panvk_meta_copy_buf2img_get_info_field(&b, buf.stride.surf);

   unsigned imgtexelsz = util_format_get_blocksize(key.imgfmt);
   unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);
   unsigned writemask = key.mask;

   /* Source address = base + x * texelsize + y * line_stride + z * surf_stride. */
   nir_ssa_def *offset =
      nir_imul(&b, nir_channel(&b, coord, 0), nir_imm_int(&b, buftexelsz));
   offset = nir_iadd(&b, offset,
                     nir_imul(&b, nir_channel(&b, coord, 1), buflinestride));
   offset = nir_iadd(&b, offset,
                     nir_imul(&b, nir_channel(&b, coord, 2), bufsurfstride));
   bufptr = nir_iadd(&b, bufptr, nir_u2u64(&b, offset));

   /* Component size of the render-target format: 8-bit for the blendable
    * formats (texel size <= 4, except RGB565 which is handled as a packed
    * word below), otherwise the largest power-of-two size, capped at 4.
    */
   unsigned imgcompsz =
      (imgtexelsz <= 4 && key.imgfmt != PIPE_FORMAT_R5G6B5_UNORM) ?
      1 : MIN2(1 << (ffs(imgtexelsz) - 1), 4);

   unsigned nimgcomps = imgtexelsz / imgcompsz;
   unsigned bufcompsz = MIN2(buftexelsz, imgcompsz);
   unsigned nbufcomps = buftexelsz / bufcompsz;

   assert(bufcompsz == 1 || bufcompsz == 2 || bufcompsz == 4);
   assert(nbufcomps <= 4 && nimgcomps <= 4);

   nir_ssa_def *texel =
      nir_load_global(&b, bufptr, bufcompsz, nbufcomps, bufcompsz * 8);

   enum glsl_base_type basetype;
   if (key.imgfmt == PIPE_FORMAT_R5G6B5_UNORM) {
      /* Unpack the 5:6:5 word into three components and normalize them to
       * [0, 1] floats for the blend unit.
       */
      texel = nir_vec3(&b,
                       nir_iand_imm(&b, texel, BITFIELD_MASK(5)),
                       nir_iand_imm(&b, nir_ushr_imm(&b, texel, 5), BITFIELD_MASK(6)),
                       nir_iand_imm(&b, nir_ushr_imm(&b, texel, 11), BITFIELD_MASK(5)));
      texel = nir_fmul(&b,
                       nir_u2f32(&b, texel),
                       nir_vec3(&b,
                                nir_imm_float(&b, 1.0f / 31),
                                nir_imm_float(&b, 1.0f / 63),
                                nir_imm_float(&b, 1.0f / 31)));
      nimgcomps = 3;
      basetype = GLSL_TYPE_FLOAT;
   } else if (imgcompsz == 1) {
      assert(bufcompsz == 1);
      /* Blendable formats are unorm and the fixed-function blend unit
       * takes float values.
       */
      texel = nir_fmul(&b, nir_u2f32(&b, texel),
                       nir_imm_float(&b, 1.0f / 255));
      basetype = GLSL_TYPE_FLOAT;
   } else {
      /* Raw UINT path: just resize to the render-target component width. */
      texel = nir_u2uN(&b, texel, imgcompsz * 8);
      basetype = imgcompsz == 2 ? GLSL_TYPE_UINT16 : GLSL_TYPE_UINT;
   }

   /* We always pass the texel using 32-bit regs for now */
   nir_variable *out =
      nir_variable_create(b.shader, nir_var_shader_out,
                          glsl_vector_type(basetype, nimgcomps),
                          "out");
   out->data.location = FRAG_RESULT_DATA0;

   uint16_t fullmask = (1 << nimgcomps) - 1;

   assert(fullmask >= writemask);

   /* Partial write (e.g. stencil-only into Z24S8): splice the copied
    * components into the rest of the render-target value. For 8-bit
    * components the untouched channels are zero-filled instead of
    * re-read.
    */
   if (fullmask != writemask) {
      unsigned first_written_comp = ffs(writemask) - 1;
      nir_ssa_def *oldtexel = NULL;
      if (imgcompsz > 1)
         oldtexel = nir_load_var(&b, out);

      nir_ssa_def *texel_comps[4];
      for (unsigned i = 0; i < nimgcomps; i++) {
         if (writemask & BITFIELD_BIT(i))
            texel_comps[i] = nir_channel(&b, texel, i - first_written_comp);
         else if (imgcompsz > 1)
            texel_comps[i] = nir_channel(&b, oldtexel, i);
         else
            texel_comps[i] = nir_imm_intN_t(&b, 0, texel->bit_size);
      }

      texel = nir_vec(&b, texel_comps, nimgcomps);
   }

   nir_store_var(&b, out, texel, 0xff);

   struct panfrost_compile_inputs inputs = {
      .gpu_id = pdev->gpu_id,
      .is_blit = true,
   };

#if PAN_ARCH >= 6
   /* Bifrost and later: pin the RT conversion descriptor to a raw
    * U16/U32 register/memory format matching the component size.
    */
   pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) {
      cfg.memory_format = (imgcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12;
      cfg.register_format = imgcompsz == 2 ?
                            MALI_REGISTER_FILE_FORMAT_U16 :
                            MALI_REGISTER_FILE_FORMAT_U32;
   }
   inputs.bifrost.static_rt_conv = true;
#endif

   struct util_dynarray binary;

   util_dynarray_init(&binary, NULL);
   GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);

   /* Make sure UBO words have been upgraded to push constants */
   assert(shader_info->ubo_count == 1);

   mali_ptr shader =
      pan_pool_upload_aligned(bin_pool, binary.data, binary.size,
                              PAN_ARCH >= 6 ? 128 : 64);

   util_dynarray_fini(&binary);
   ralloc_free(b.shader);

   return shader;
}
1103 
1104 static unsigned
panvk_meta_copy_buf2img_format_idx(struct panvk_meta_copy_format_info key)1105 panvk_meta_copy_buf2img_format_idx(struct panvk_meta_copy_format_info key)
1106 {
1107    for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_buf2img_fmts); i++) {
1108       if (!memcmp(&key, &panvk_meta_copy_buf2img_fmts[i], sizeof(key)))
1109          return i;
1110    }
1111 
1112    unreachable("Invalid image format\n");
1113 }
1114 
/* Records one VkBufferImageCopy2 region as a series of draws, one per
 * destination layer/slice: the destination image miplevel is bound as the
 * render target and a quad covering the destination rectangle is drawn
 * with the precompiled buf2img fragment shader.
 */
static void
panvk_meta_copy_buf2img(struct panvk_cmd_buffer *cmdbuf,
                        const struct panvk_buffer *buf,
                        const struct panvk_image *img,
                        const VkBufferImageCopy2 *region)
{
   struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;
   struct pan_fb_info *fbinfo = &cmdbuf->state.fb.info;
   /* Destination rectangle in pixels, clamped to >= 0. */
   unsigned minx = MAX2(region->imageOffset.x, 0);
   unsigned miny = MAX2(region->imageOffset.y, 0);
   unsigned maxx = MAX2(region->imageOffset.x + region->imageExtent.width - 1, 0);
   unsigned maxy = MAX2(region->imageOffset.y + region->imageExtent.height - 1, 0);

   mali_ptr vpd =
      panvk_per_arch(meta_emit_viewport)(&cmdbuf->desc_pool.base,
                                         minx, miny, maxx, maxy);

   /* Destination quad corners, (x, y, z, w) per vertex. Shared by all
    * layers, so uploaded once outside the loop.
    */
   float dst_rect[] = {
      minx, miny, 0.0, 1.0,
      maxx + 1, miny, 0.0, 1.0,
      minx, maxy + 1, 0.0, 1.0,
      maxx + 1, maxy + 1, 0.0, 1.0,
   };
   mali_ptr dst_coords =
      pan_pool_upload_aligned(&cmdbuf->desc_pool.base, dst_rect,
                              sizeof(dst_rect), 64);

   /* Map the image format + copied aspect onto one of the precompiled
    * pipelines.
    */
   struct panvk_meta_copy_format_info key = {
      .imgfmt = panvk_meta_copy_buf2img_format(img->pimage.layout.format),
      .mask = panvk_meta_copy_img_mask(img->pimage.layout.format,
                                       region->imageSubresource.aspectMask),
   };

   unsigned fmtidx = panvk_meta_copy_buf2img_format_idx(key);

   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.buf2img[fmtidx].rsd;
   const struct panfrost_ubo_push *pushmap =
      &cmdbuf->device->physical_device->meta.copy.buf2img[fmtidx].pushmap;

   /* Buffer addressing info consumed by the fragment shader. Per the
    * Vulkan spec, a zero bufferRowLength/bufferImageHeight means
    * "tightly packed", hence the ?: fallbacks to the image extent.
    */
   unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);
   struct panvk_meta_copy_buf2img_info info = {
      .buf.ptr = buf->bo->ptr.gpu + buf->bo_offset + region->bufferOffset,
      .buf.stride.line = (region->bufferRowLength ? : region->imageExtent.width) * buftexelsz,
   };

   info.buf.stride.surf =
      (region->bufferImageHeight ? : region->imageExtent.height) * info.buf.stride.line;

   mali_ptr pushconsts =
      panvk_meta_copy_emit_push_constants(pdev, pushmap, &cmdbuf->desc_pool.base,
                                          &info, sizeof(info));
   mali_ptr ubo =
      panvk_meta_copy_emit_ubo(pdev, &cmdbuf->desc_pool.base, &info, sizeof(info));

   /* Render-target view of the destination miplevel; the layer range is
    * updated per iteration of the loop below.
    */
   struct pan_image_view view = {
      .format = key.imgfmt,
      .dim = MALI_TEXTURE_DIMENSION_2D,
      .image = &img->pimage,
      .nr_samples = img->pimage.layout.nr_samples,
      .first_level = region->imageSubresource.mipLevel,
      .last_level = region->imageSubresource.mipLevel,
      .swizzle = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
   };

   /* TODO: don't force preloads of dst resources if unneeded */
   cmdbuf->state.fb.crc_valid[0] = false;
   *fbinfo = (struct pan_fb_info){
      .width = u_minify(img->pimage.layout.width, region->imageSubresource.mipLevel),
      .height = u_minify(img->pimage.layout.height, region->imageSubresource.mipLevel),
      .extent.minx = minx,
      .extent.maxx = maxx,
      .extent.miny = miny,
      .extent.maxy = maxy,
      .nr_samples = 1,
      .rt_count = 1,
      .rts[0].view = &view,
      .rts[0].preload = true,
      .rts[0].crc_valid = &cmdbuf->state.fb.crc_valid[0],
   };

   /* The FB state above replaces whatever was recorded before: flush the
    * currently open batch first.
    */
   panvk_per_arch(cmd_close_batch)(cmdbuf);

   /* A region copies either multiple array layers or multiple depth
    * slices, never both.
    */
   assert(region->imageSubresource.layerCount == 1 ||
          region->imageExtent.depth == 1);
   assert(region->imageOffset.z >= 0);
   unsigned first_layer = MAX2(region->imageSubresource.baseArrayLayer, region->imageOffset.z);
   unsigned nlayers = MAX2(region->imageSubresource.layerCount, region->imageExtent.depth);
   for (unsigned l = 0; l < nlayers; l++) {
      /* Source coordinates, (x, y, slice) per vertex, interpolated into
       * the shader's "coord" varying.
       */
      float src_rect[] = {
         0, 0, l, 1.0,
         region->imageExtent.width, 0, l, 1.0,
         0, region->imageExtent.height, l, 1.0,
         region->imageExtent.width, region->imageExtent.height, l, 1.0,
      };

      mali_ptr src_coords =
         pan_pool_upload_aligned(&cmdbuf->desc_pool.base, src_rect,
                                 sizeof(src_rect), 64);

      /* One batch per layer: open, record the draw, close. */
      struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

      view.first_layer = view.last_layer = l + first_layer;
      batch->blit.src = buf->bo;
      batch->blit.dst = img->pimage.data.bo;
      panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, true);
      panvk_per_arch(cmd_alloc_fb_desc)(cmdbuf);
      panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf);

      mali_ptr tsd, tiler;

#if PAN_ARCH >= 6
      tsd = batch->tls.gpu;
      tiler = batch->tiler.descs.gpu;
#else
      tsd = batch->fb.desc.gpu;
      tiler = 0;
#endif

      struct panfrost_ptr job;

      job = panvk_meta_copy_emit_tiler_job(&cmdbuf->desc_pool.base,
                                           &batch->scoreboard,
                                           src_coords, dst_coords,
                                           0, 0, ubo, pushconsts,
                                           vpd, rsd, tsd, tiler);

      util_dynarray_append(&batch->jobs, void *, job.cpu);
      panvk_per_arch(cmd_close_batch)(cmdbuf);
   }
}
1246 
1247 static void
panvk_meta_copy_buf2img_init(struct panvk_physical_device * dev)1248 panvk_meta_copy_buf2img_init(struct panvk_physical_device *dev)
1249 {
1250    STATIC_ASSERT(ARRAY_SIZE(panvk_meta_copy_buf2img_fmts) == PANVK_META_COPY_BUF2IMG_NUM_FORMATS);
1251 
1252    for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_buf2img_fmts); i++) {
1253       struct pan_shader_info shader_info;
1254       mali_ptr shader =
1255          panvk_meta_copy_buf2img_shader(&dev->pdev, &dev->meta.bin_pool.base,
1256                                         panvk_meta_copy_buf2img_fmts[i],
1257                                         &shader_info);
1258       dev->meta.copy.buf2img[i].pushmap = shader_info.push;
1259       dev->meta.copy.buf2img[i].rsd =
1260          panvk_meta_copy_to_img_emit_rsd(&dev->pdev, &dev->meta.desc_pool.base,
1261                                          shader, &shader_info,
1262                                          panvk_meta_copy_buf2img_fmts[i].imgfmt,
1263                                          panvk_meta_copy_buf2img_fmts[i].mask,
1264                                          false);
1265    }
1266 }
1267 
1268 void
panvk_per_arch(CmdCopyBufferToImage2)1269 panvk_per_arch(CmdCopyBufferToImage2)(VkCommandBuffer commandBuffer,
1270                                       const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)
1271 {
1272    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1273    VK_FROM_HANDLE(panvk_buffer, buf, pCopyBufferToImageInfo->srcBuffer);
1274    VK_FROM_HANDLE(panvk_image, img, pCopyBufferToImageInfo->dstImage);
1275 
1276    for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; i++) {
1277       panvk_meta_copy_buf2img(cmdbuf, buf, img, &pCopyBufferToImageInfo->pRegions[i]);
1278    }
1279 }
1280 
/* All (texture format, component mask) pairs image-to-buffer copies are
 * lowered to. Raw UINT views are used so texel bits round-trip untouched.
 */
static const struct panvk_meta_copy_format_info panvk_meta_copy_img2buf_fmts[] = {
   { PIPE_FORMAT_R8_UINT, 0x1 },
   { PIPE_FORMAT_R8G8_UINT, 0x3 },
   { PIPE_FORMAT_R5G6B5_UNORM, 0x7 },
   { PIPE_FORMAT_R8G8B8A8_UINT, 0xf },
   { PIPE_FORMAT_R16G16B16_UINT, 0x7 },
   { PIPE_FORMAT_R32G32_UINT, 0x3 },
   { PIPE_FORMAT_R32G32B32_UINT, 0x7 },
   { PIPE_FORMAT_R32G32B32A32_UINT, 0xf },
   /* S8 -> Z24S8 */
   { PIPE_FORMAT_R8G8B8A8_UINT, 0x8 },
   /* S8 -> Z32_S8X24 */
   { PIPE_FORMAT_R32G32_UINT, 0x2 },
   /* Z24X8 -> Z24S8 */
   { PIPE_FORMAT_R8G8B8A8_UINT, 0x7 },
   /* Z32 -> Z32_S8X24 */
   { PIPE_FORMAT_R32G32_UINT, 0x1 },
};
1299 
1300 static enum pipe_format
panvk_meta_copy_img2buf_format(enum pipe_format imgfmt)1301 panvk_meta_copy_img2buf_format(enum pipe_format imgfmt)
1302 {
1303    /* Pick blendable formats when we can, and the FLOAT variant matching the
1304     * texelsize otherwise.
1305     */
1306    switch (util_format_get_blocksize(imgfmt)) {
1307    case 1: return PIPE_FORMAT_R8_UINT;
1308    /* AFBC stores things differently for RGB565,
1309     * we can't simply map to R8G8 in that case */
1310    case 2: return (imgfmt == PIPE_FORMAT_R5G6B5_UNORM ||
1311                    imgfmt == PIPE_FORMAT_B5G6R5_UNORM) ?
1312                   PIPE_FORMAT_R5G6B5_UNORM : PIPE_FORMAT_R8G8_UINT;
1313    case 4: return PIPE_FORMAT_R8G8B8A8_UINT;
1314    case 6: return PIPE_FORMAT_R16G16B16_UINT;
1315    case 8: return PIPE_FORMAT_R32G32_UINT;
1316    case 12: return PIPE_FORMAT_R32G32B32_UINT;
1317    case 16: return PIPE_FORMAT_R32G32B32A32_UINT;
1318    default: unreachable("Invalid format\n");
1319    }
1320 }
1321 
/* UBO payload consumed by the image-to-buffer compute shader: destination
 * buffer addressing plus the image-space offset and clip rectangle of the
 * copy.
 */
struct panvk_meta_copy_img2buf_info {
   struct {
      mali_ptr ptr;     /* GPU address of the destination buffer */
      struct {
         unsigned line; /* bytes between two buffer rows */
         unsigned surf; /* bytes between two buffer slices/layers */
      } stride;
   } buf;
   struct {
      struct {
         unsigned x, y, z; /* image offset added to the invocation ID */
      } offset;
      struct {
         unsigned minx, miny, maxx, maxy; /* inclusive copy bounds */
      } extent;
   } img;
};
1339 
/* Emit a NIR load of one field of struct panvk_meta_copy_img2buf_info from
 * UBO 0; offsetof()/sizeof() derive the byte offset and bit size at compile
 * time so the shader layout always matches the C struct.
 */
#define panvk_meta_copy_img2buf_get_info_field(b, field) \
        nir_load_ubo((b), 1, \
                     sizeof(((struct panvk_meta_copy_img2buf_info *)0)->field) * 8, \
                     nir_imm_int(b, 0), \
                     nir_imm_int(b, offsetof(struct panvk_meta_copy_img2buf_info, field)), \
                     .align_mul = 4, \
                     .align_offset = 0, \
                     .range_base = 0, \
                     .range = ~0)
1349 
1350 static mali_ptr
panvk_meta_copy_img2buf_shader(struct panfrost_device * pdev,struct pan_pool * bin_pool,struct panvk_meta_copy_format_info key,unsigned texdim,unsigned texisarray,struct pan_shader_info * shader_info)1351 panvk_meta_copy_img2buf_shader(struct panfrost_device *pdev,
1352                                struct pan_pool *bin_pool,
1353                                struct panvk_meta_copy_format_info key,
1354                                unsigned texdim, unsigned texisarray,
1355                                struct pan_shader_info *shader_info)
1356 {
1357    unsigned imgtexelsz = util_format_get_blocksize(key.imgfmt);
1358    unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);
1359 
1360    /* FIXME: Won't work on compute queues, but we can't do that with
1361     * a compute shader if the destination is an AFBC surface.
1362     */
1363    nir_builder b =
1364       nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
1365                                      GENX(pan_shader_get_compiler_options)(),
1366                                      "panvk_meta_copy_img2buf(dim=%dD%s,imgfmt=%s,mask=%x)",
1367                                      texdim, texisarray ? "[]" : "",
1368                                      util_format_name(key.imgfmt),
1369                                      key.mask);
1370 
1371    b.shader->info.num_ubos = 1;
1372 
1373    nir_ssa_def *coord = nir_load_global_invocation_id(&b, 32);
1374    nir_ssa_def *bufptr =
1375       panvk_meta_copy_img2buf_get_info_field(&b, buf.ptr);
1376    nir_ssa_def *buflinestride =
1377       panvk_meta_copy_img2buf_get_info_field(&b, buf.stride.line);
1378    nir_ssa_def *bufsurfstride =
1379       panvk_meta_copy_img2buf_get_info_field(&b, buf.stride.surf);
1380 
1381    nir_ssa_def *imgminx =
1382       panvk_meta_copy_img2buf_get_info_field(&b, img.extent.minx);
1383    nir_ssa_def *imgminy =
1384       panvk_meta_copy_img2buf_get_info_field(&b, img.extent.miny);
1385    nir_ssa_def *imgmaxx =
1386       panvk_meta_copy_img2buf_get_info_field(&b, img.extent.maxx);
1387    nir_ssa_def *imgmaxy =
1388       panvk_meta_copy_img2buf_get_info_field(&b, img.extent.maxy);
1389 
1390    nir_ssa_def *imgcoords, *inbounds;
1391 
1392    switch (texdim + texisarray) {
1393    case 1:
1394       imgcoords =
1395          nir_iadd(&b,
1396                   nir_channel(&b, coord, 0),
1397                   panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x));
1398       inbounds =
1399          nir_iand(&b,
1400                   nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
1401                   nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx));
1402       break;
1403    case 2:
1404       imgcoords =
1405          nir_vec2(&b,
1406                   nir_iadd(&b,
1407                            nir_channel(&b, coord, 0),
1408                            panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x)),
1409                   nir_iadd(&b,
1410                            nir_channel(&b, coord, 1),
1411                            panvk_meta_copy_img2buf_get_info_field(&b, img.offset.y)));
1412       inbounds =
1413          nir_iand(&b,
1414                   nir_iand(&b,
1415                            nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
1416                            nir_uge(&b, imgmaxy, nir_channel(&b, imgcoords, 1))),
1417                   nir_iand(&b,
1418                            nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx),
1419                            nir_uge(&b, nir_channel(&b, imgcoords, 1), imgminy)));
1420       break;
1421    case 3:
1422       imgcoords =
1423          nir_vec3(&b,
1424                   nir_iadd(&b,
1425                            nir_channel(&b, coord, 0),
1426                            panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x)),
1427                   nir_iadd(&b,
1428                            nir_channel(&b, coord, 1),
1429                            panvk_meta_copy_img2buf_get_info_field(&b, img.offset.y)),
1430                   nir_iadd(&b,
1431                            nir_channel(&b, coord, 2),
1432                            panvk_meta_copy_img2buf_get_info_field(&b, img.offset.y)));
1433       inbounds =
1434          nir_iand(&b,
1435                   nir_iand(&b,
1436                            nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
1437                            nir_uge(&b, imgmaxy, nir_channel(&b, imgcoords, 1))),
1438                   nir_iand(&b,
1439                            nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx),
1440                            nir_uge(&b, nir_channel(&b, imgcoords, 1), imgminy)));
1441       break;
1442    default:
1443       unreachable("Invalid texture dimension\n");
1444    }
1445 
1446    nir_push_if(&b, inbounds);
1447 
1448    /* FIXME: doesn't work for tiled+compressed formats since blocks are 4x4
1449     * blocks instead of 16x16 texels in that case, and there's nothing we can
1450     * do to force the tile size to 4x4 in the render path.
1451     * This being said, compressed textures are not compatible with AFBC, so we
1452     * could use a compute shader arranging the blocks properly.
1453     */
1454    nir_ssa_def *offset =
1455       nir_imul(&b, nir_channel(&b, coord, 0), nir_imm_int(&b, buftexelsz));
1456    offset = nir_iadd(&b, offset,
1457                      nir_imul(&b, nir_channel(&b, coord, 1), buflinestride));
1458    offset = nir_iadd(&b, offset,
1459                      nir_imul(&b, nir_channel(&b, coord, 2), bufsurfstride));
1460    bufptr = nir_iadd(&b, bufptr, nir_u2u64(&b, offset));
1461 
1462    unsigned imgcompsz = imgtexelsz <= 4 ?
1463                         1 : MIN2(1 << (ffs(imgtexelsz) - 1), 4);
1464    unsigned nimgcomps = imgtexelsz / imgcompsz;
1465    assert(nimgcomps <= 4);
1466 
1467    nir_tex_instr *tex = nir_tex_instr_create(b.shader, 1);
1468    tex->op = nir_texop_txf;
1469    tex->texture_index = 0;
1470    tex->is_array = texisarray;
1471    tex->dest_type = util_format_is_unorm(key.imgfmt) ?
1472                     nir_type_float32 : nir_type_uint32;
1473 
1474    switch (texdim) {
1475    case 1: tex->sampler_dim = GLSL_SAMPLER_DIM_1D; break;
1476    case 2: tex->sampler_dim = GLSL_SAMPLER_DIM_2D; break;
1477    case 3: tex->sampler_dim = GLSL_SAMPLER_DIM_3D; break;
1478    default: unreachable("Invalid texture dimension");
1479    }
1480 
1481    tex->src[0].src_type = nir_tex_src_coord;
1482    tex->src[0].src = nir_src_for_ssa(imgcoords);
1483    tex->coord_components = texdim + texisarray;
1484    nir_ssa_dest_init(&tex->instr, &tex->dest, 4,
1485                      nir_alu_type_get_type_size(tex->dest_type), NULL);
1486    nir_builder_instr_insert(&b, &tex->instr);
1487 
1488    nir_ssa_def *texel = &tex->dest.ssa;
1489 
1490    unsigned fullmask = (1 << util_format_get_nr_components(key.imgfmt)) - 1;
1491    unsigned nbufcomps = util_bitcount(fullmask);
1492    if (key.mask != fullmask) {
1493       nir_ssa_def *bufcomps[4];
1494       nbufcomps = 0;
1495       for (unsigned i = 0; i < nimgcomps; i++) {
1496          if (key.mask & BITFIELD_BIT(i))
1497             bufcomps[nbufcomps++] = nir_channel(&b, texel, i);
1498       }
1499 
1500       texel = nir_vec(&b, bufcomps, nbufcomps);
1501    }
1502 
1503    unsigned bufcompsz = buftexelsz / nbufcomps;
1504 
1505    if (key.imgfmt == PIPE_FORMAT_R5G6B5_UNORM) {
1506       texel = nir_fmul(&b, texel,
1507                        nir_vec3(&b,
1508                                 nir_imm_float(&b, 31),
1509                                 nir_imm_float(&b, 63),
1510                                 nir_imm_float(&b, 31)));
1511       texel = nir_f2u16(&b, texel);
1512       texel = nir_ior(&b, nir_channel(&b, texel, 0),
1513                       nir_ior(&b,
1514                               nir_ishl(&b, nir_channel(&b, texel, 1), nir_imm_int(&b, 5)),
1515                               nir_ishl(&b, nir_channel(&b, texel, 2), nir_imm_int(&b, 11))));
1516       imgcompsz = 2;
1517       bufcompsz = 2;
1518       nbufcomps = 1;
1519       nimgcomps = 1;
1520    } else if (imgcompsz == 1) {
1521       nir_ssa_def *packed = nir_channel(&b, texel, 0);
1522       for (unsigned i = 1; i < nbufcomps; i++) {
1523          packed = nir_ior(&b, packed,
1524                           nir_ishl(&b, nir_iand_imm(&b, nir_channel(&b, texel, i), 0xff),
1525                                    nir_imm_int(&b, i * 8)));
1526       }
1527       texel = packed;
1528 
1529       bufcompsz = nbufcomps == 3 ? 4 : nbufcomps;
1530       nbufcomps = 1;
1531    }
1532 
1533    assert(bufcompsz == 1 || bufcompsz == 2 || bufcompsz == 4);
1534    assert(nbufcomps <= 4 && nimgcomps <= 4);
1535    texel = nir_u2uN(&b, texel, bufcompsz * 8);
1536 
1537    nir_store_global(&b, bufptr, bufcompsz, texel, (1 << nbufcomps) - 1);
1538    nir_pop_if(&b, NULL);
1539 
1540    struct panfrost_compile_inputs inputs = {
1541       .gpu_id = pdev->gpu_id,
1542       .is_blit = true,
1543    };
1544 
1545    struct util_dynarray binary;
1546 
1547    util_dynarray_init(&binary, NULL);
1548    GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
1549 
1550    /* Make sure UBO words have been upgraded to push constants and everything
1551     * is at the right place.
1552     */
1553    assert(shader_info->ubo_count == 1);
1554    assert(shader_info->push.count <= (sizeof(struct panvk_meta_copy_img2buf_info) / 4));
1555 
1556    mali_ptr shader =
1557       pan_pool_upload_aligned(bin_pool, binary.data, binary.size,
1558                               PAN_ARCH >= 6 ? 128 : 64);
1559 
1560    util_dynarray_fini(&binary);
1561    ralloc_free(b.shader);
1562 
1563    return shader;
1564 }
1565 
1566 static unsigned
panvk_meta_copy_img2buf_format_idx(struct panvk_meta_copy_format_info key)1567 panvk_meta_copy_img2buf_format_idx(struct panvk_meta_copy_format_info key)
1568 {
1569    for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_img2buf_fmts); i++) {
1570       if (!memcmp(&key, &panvk_meta_copy_img2buf_fmts[i], sizeof(key)))
1571          return i;
1572    }
1573 
1574    unreachable("Invalid texel size\n");
1575 }
1576 
/*
 * Record a compute dispatch copying one VkBufferImageCopy2 region from an
 * image to a buffer.
 *
 * The dispatch is tiled in 16x16 workgroups whose origin is rounded down to
 * a multiple of 16; the shader receives the real copy rectangle
 * (extent.min/max) in a UBO and discards out-of-bounds invocations (see the
 * inbounds test in panvk_meta_copy_img2buf_shader).
 */
static void
panvk_meta_copy_img2buf(struct panvk_cmd_buffer *cmdbuf,
                        const struct panvk_buffer *buf,
                        const struct panvk_image *img,
                        const VkBufferImageCopy2 *region)
{
   struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;
   /* Key selecting the pre-compiled shader variant: the copy format plus the
    * component mask derived from the requested image aspect.
    */
   struct panvk_meta_copy_format_info key = {
      .imgfmt = panvk_meta_copy_img2buf_format(img->pimage.layout.format),
      .mask = panvk_meta_copy_img_mask(img->pimage.layout.format,
                                       region->imageSubresource.aspectMask),
   };
   unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);
   unsigned texdimidx =
      panvk_meta_copy_tex_type(img->pimage.layout.dim,
                               img->pimage.layout.array_size > 1);
   unsigned fmtidx = panvk_meta_copy_img2buf_format_idx(key);

   /* RSD and push-constant map were pre-built at physical-device init
    * (panvk_meta_copy_img2buf_init).
    */
   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.img2buf[texdimidx][fmtidx].rsd;
   const struct panfrost_ubo_push *pushmap =
      &cmdbuf->device->physical_device->meta.copy.img2buf[texdimidx][fmtidx].pushmap;

   /* UBO contents consumed by the copy shader. Per the Vulkan spec,
    * bufferRowLength/bufferImageHeight of zero mean "tightly packed", hence
    * the `?:` fallbacks to the image extent. Offsets are rounded down to a
    * multiple of 16 to match the 16x16 dispatch tiling.
    */
   struct panvk_meta_copy_img2buf_info info = {
      .buf.ptr = buf->bo->ptr.gpu + buf->bo_offset + region->bufferOffset,
      .buf.stride.line = (region->bufferRowLength ? : region->imageExtent.width) * buftexelsz,
      .img.offset.x = MAX2(region->imageOffset.x & ~15, 0),
      .img.extent.minx = MAX2(region->imageOffset.x, 0),
      .img.extent.maxx = MAX2(region->imageOffset.x + region->imageExtent.width - 1, 0),
   };

   if (img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_1D) {
      /* For 1D images the Y dimension of the dispatch iterates over array
       * layers instead of texel rows.
       */
      info.img.extent.maxy = region->imageSubresource.layerCount - 1;
   } else {
      info.img.offset.y = MAX2(region->imageOffset.y & ~15, 0);
      info.img.offset.z = MAX2(region->imageOffset.z, 0);
      info.img.extent.miny = MAX2(region->imageOffset.y, 0);
      info.img.extent.maxy = MAX2(region->imageOffset.y + region->imageExtent.height - 1, 0);
   }

   info.buf.stride.surf = (region->bufferImageHeight ? : region->imageExtent.height) *
                          info.buf.stride.line;

   mali_ptr pushconsts =
      panvk_meta_copy_emit_push_constants(pdev, pushmap, &cmdbuf->desc_pool.base,
                                          &info, sizeof(info));
   mali_ptr ubo =
      panvk_meta_copy_emit_ubo(pdev, &cmdbuf->desc_pool.base, &info, sizeof(info));

   /* Texture view covering only the mip level / layer range being copied.
    * Cube maps are sampled as 2D arrays.
    */
   struct pan_image_view view = {
      .format = key.imgfmt,
      .dim = img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_CUBE ?
             MALI_TEXTURE_DIMENSION_2D : img->pimage.layout.dim,
      .image = &img->pimage,
      .nr_samples = img->pimage.layout.nr_samples,
      .first_level = region->imageSubresource.mipLevel,
      .last_level = region->imageSubresource.mipLevel,
      .first_layer = region->imageSubresource.baseArrayLayer,
      .last_layer = region->imageSubresource.baseArrayLayer + region->imageSubresource.layerCount - 1,
      .swizzle = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
   };

   mali_ptr texture =
      panvk_meta_copy_img_emit_texture(pdev, &cmdbuf->desc_pool.base, &view);
   mali_ptr sampler =
      panvk_meta_copy_img_emit_sampler(pdev, &cmdbuf->desc_pool.base);

   /* The copy runs in its own batch: close whatever is in flight, open a
    * fresh one, and close it again once the job is queued.
    */
   panvk_per_arch(cmd_close_batch)(cmdbuf);

   struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

   struct pan_tls_info tlsinfo = { 0 };

   /* Track the BOs so batch submission can wait on/signal them. */
   batch->blit.src = img->pimage.data.bo;
   batch->blit.dst = buf->bo;
   batch->tls =
      pan_pool_alloc_desc(&cmdbuf->desc_pool.base, LOCAL_STORAGE);
   GENX(pan_emit_tls)(&tlsinfo, batch->tls.cpu);

   mali_ptr tsd = batch->tls.gpu;

   /* 16x16 workgroups (16x1 for 1D images). */
   struct pan_compute_dim wg_sz = {
      16,
      img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_1D ? 1 : 16,
      1,
   };

   /* Number of workgroups needed to cover the 16-aligned copy rectangle;
    * Z covers layers (and depth for 3D images).
    */
   struct pan_compute_dim num_wg = {
     (ALIGN_POT(info.img.extent.maxx + 1, 16) - info.img.offset.x) / 16,
     img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_1D ?
        region->imageSubresource.layerCount :
        (ALIGN_POT(info.img.extent.maxy + 1, 16) - info.img.offset.y) / 16,
     img->pimage.layout.dim != MALI_TEXTURE_DIMENSION_1D ?
        MAX2(region->imageSubresource.layerCount, region->imageExtent.depth) : 1,
   };

   struct panfrost_ptr job =
      panvk_meta_copy_emit_compute_job(&cmdbuf->desc_pool.base,
                                       &batch->scoreboard, &num_wg, &wg_sz,
                                       texture, sampler,
                                       ubo, pushconsts,
                                       rsd, tsd);

   util_dynarray_append(&batch->jobs, void *, job.cpu);

   panvk_per_arch(cmd_close_batch)(cmdbuf);
}
1684 
/*
 * Pre-compile every image-to-buffer copy shader variant and cache its RSD
 * and push-constant map on the physical device. For each supported format
 * we build the 1D/2D/3D variants plus the 1D-array/2D-array ones (there is
 * no 3D-array variant, hence the `texdim == 3` early continue).
 */
static void
panvk_meta_copy_img2buf_init(struct panvk_physical_device *dev)
{
   STATIC_ASSERT(ARRAY_SIZE(panvk_meta_copy_img2buf_fmts) == PANVK_META_COPY_IMG2BUF_NUM_FORMATS);

   for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_img2buf_fmts); i++) {
      for (unsigned texdim = 1; texdim <= 3; texdim++) {
         /* Non-array variant. */
         unsigned texdimidx = panvk_meta_copy_tex_type(texdim, false);
         assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2buf));

         struct pan_shader_info shader_info;
         mali_ptr shader =
            panvk_meta_copy_img2buf_shader(&dev->pdev, &dev->meta.bin_pool.base,
                                           panvk_meta_copy_img2buf_fmts[i],
                                           texdim, false, &shader_info);
         dev->meta.copy.img2buf[texdimidx][i].pushmap = shader_info.push;
         dev->meta.copy.img2buf[texdimidx][i].rsd =
            panvk_meta_copy_to_buf_emit_rsd(&dev->pdev,
                                            &dev->meta.desc_pool.base,
                                            shader, &shader_info, true);

         /* 3D images have no array flavor. */
         if (texdim == 3)
            continue;

         /* Reset the shader info before reusing it for the array variant. */
         memset(&shader_info, 0, sizeof(shader_info));
         texdimidx = panvk_meta_copy_tex_type(texdim, true);
         assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2buf));
         shader =
            panvk_meta_copy_img2buf_shader(&dev->pdev, &dev->meta.bin_pool.base,
                                           panvk_meta_copy_img2buf_fmts[i],
                                           texdim, true, &shader_info);
         dev->meta.copy.img2buf[texdimidx][i].pushmap = shader_info.push;
         dev->meta.copy.img2buf[texdimidx][i].rsd =
            panvk_meta_copy_to_buf_emit_rsd(&dev->pdev,
                                            &dev->meta.desc_pool.base,
                                            shader, &shader_info, true);
      }
   }
}
1724 
1725 void
panvk_per_arch(CmdCopyImageToBuffer2)1726 panvk_per_arch(CmdCopyImageToBuffer2)(VkCommandBuffer commandBuffer,
1727                                       const VkCopyImageToBufferInfo2 *pCopyImageToBufferInfo)
1728 {
1729    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1730    VK_FROM_HANDLE(panvk_buffer, buf, pCopyImageToBufferInfo->dstBuffer);
1731    VK_FROM_HANDLE(panvk_image, img, pCopyImageToBufferInfo->srcImage);
1732 
1733    for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; i++) {
1734       panvk_meta_copy_img2buf(cmdbuf, buf, img, &pCopyImageToBufferInfo->pRegions[i]);
1735    }
1736 }
1737 
/* UBO layout shared with the buffer-to-buffer copy shader: GPU addresses
 * of the source and destination regions.
 */
struct panvk_meta_copy_buf2buf_info {
   mali_ptr src;
   mali_ptr dst;
};
1742 
/* Emit a UBO load of one field of struct panvk_meta_copy_buf2buf_info
 * (UBO index 0); the field's struct offset doubles as the UBO byte offset,
 * which lets the compiler later promote these loads to push constants.
 */
#define panvk_meta_copy_buf2buf_get_info_field(b, field) \
        nir_load_ubo((b), 1, \
                     sizeof(((struct panvk_meta_copy_buf2buf_info *)0)->field) * 8, \
                     nir_imm_int(b, 0), \
                     nir_imm_int(b, offsetof(struct panvk_meta_copy_buf2buf_info, field)), \
                     .align_mul = 4, \
                     .align_offset = 0, \
                     .range_base = 0, \
                     .range = ~0)
1752 
1753 static mali_ptr
panvk_meta_copy_buf2buf_shader(struct panfrost_device * pdev,struct pan_pool * bin_pool,unsigned blksz,struct pan_shader_info * shader_info)1754 panvk_meta_copy_buf2buf_shader(struct panfrost_device *pdev,
1755                                struct pan_pool *bin_pool,
1756                                unsigned blksz,
1757                                struct pan_shader_info *shader_info)
1758 {
1759    /* FIXME: Won't work on compute queues, but we can't do that with
1760     * a compute shader if the destination is an AFBC surface.
1761     */
1762    nir_builder b =
1763       nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
1764                                      GENX(pan_shader_get_compiler_options)(),
1765                                      "panvk_meta_copy_buf2buf(blksz=%d)",
1766                                      blksz);
1767 
1768    b.shader->info.num_ubos = 1;
1769 
1770    nir_ssa_def *coord = nir_load_global_invocation_id(&b, 32);
1771 
1772    nir_ssa_def *offset =
1773       nir_u2u64(&b, nir_imul(&b, nir_channel(&b, coord, 0), nir_imm_int(&b, blksz)));
1774    nir_ssa_def *srcptr =
1775       nir_iadd(&b, panvk_meta_copy_buf2buf_get_info_field(&b, src), offset);
1776    nir_ssa_def *dstptr =
1777       nir_iadd(&b, panvk_meta_copy_buf2buf_get_info_field(&b, dst), offset);
1778 
1779    unsigned compsz = blksz < 4 ? blksz : 4;
1780    unsigned ncomps = blksz / compsz;
1781    nir_store_global(&b, dstptr, blksz,
1782                     nir_load_global(&b, srcptr, blksz, ncomps, compsz * 8),
1783                     (1 << ncomps) - 1);
1784 
1785    struct panfrost_compile_inputs inputs = {
1786       .gpu_id = pdev->gpu_id,
1787       .is_blit = true,
1788    };
1789 
1790    struct util_dynarray binary;
1791 
1792    util_dynarray_init(&binary, NULL);
1793    GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
1794 
1795    /* Make sure UBO words have been upgraded to push constants and everything
1796     * is at the right place.
1797     */
1798    assert(shader_info->ubo_count == 1);
1799    assert(shader_info->push.count == (sizeof(struct panvk_meta_copy_buf2buf_info) / 4));
1800 
1801    mali_ptr shader =
1802       pan_pool_upload_aligned(bin_pool, binary.data, binary.size,
1803                               PAN_ARCH >= 6 ? 128 : 64);
1804 
1805    util_dynarray_fini(&binary);
1806    ralloc_free(b.shader);
1807 
1808    return shader;
1809 }
1810 
1811 static void
panvk_meta_copy_buf2buf_init(struct panvk_physical_device * dev)1812 panvk_meta_copy_buf2buf_init(struct panvk_physical_device *dev)
1813 {
1814    for (unsigned i = 0; i < ARRAY_SIZE(dev->meta.copy.buf2buf); i++) {
1815       struct pan_shader_info shader_info;
1816       mali_ptr shader =
1817          panvk_meta_copy_buf2buf_shader(&dev->pdev, &dev->meta.bin_pool.base,
1818                                         1 << i, &shader_info);
1819       dev->meta.copy.buf2buf[i].pushmap = shader_info.push;
1820       dev->meta.copy.buf2buf[i].rsd =
1821          panvk_meta_copy_to_buf_emit_rsd(&dev->pdev, &dev->meta.desc_pool.base,
1822                                          shader, &shader_info, false);
1823    }
1824 }
1825 
/*
 * Record a compute dispatch copying one VkBufferCopy2 region. The copy is
 * done in blocks: the block size is the largest power of two (capped at 16
 * bytes) dividing the source address, destination address and copy size,
 * selecting the matching pre-compiled shader variant by log2(block size).
 */
static void
panvk_meta_copy_buf2buf(struct panvk_cmd_buffer *cmdbuf,
                        const struct panvk_buffer *src,
                        const struct panvk_buffer *dst,
                        const VkBufferCopy2 *region)
{
   struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;

   struct panvk_meta_copy_buf2buf_info info = {
      .src = src->bo->ptr.gpu + src->bo_offset + region->srcOffset,
      .dst = dst->bo->ptr.gpu + dst->bo_offset + region->dstOffset,
   };

   /* ffs() on the low 4 bits of (src | dst | size) finds the lowest set
    * alignment bit; it returns 0 when all three are 16-byte aligned, in
    * which case the largest (16-byte, log2 == 4) block is used.
    */
   unsigned alignment = ffs((info.src | info.dst | region->size) & 15);
   unsigned log2blksz = alignment ? alignment - 1 : 4;

   assert(log2blksz < ARRAY_SIZE(cmdbuf->device->physical_device->meta.copy.buf2buf));
   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.buf2buf[log2blksz].rsd;
   const struct panfrost_ubo_push *pushmap =
      &cmdbuf->device->physical_device->meta.copy.buf2buf[log2blksz].pushmap;

   mali_ptr pushconsts =
      panvk_meta_copy_emit_push_constants(pdev, pushmap, &cmdbuf->desc_pool.base,
                                          &info, sizeof(info));
   mali_ptr ubo =
      panvk_meta_copy_emit_ubo(pdev, &cmdbuf->desc_pool.base, &info, sizeof(info));

   /* Run the copy in its own batch. */
   panvk_per_arch(cmd_close_batch)(cmdbuf);

   struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

   panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, false);

   mali_ptr tsd = batch->tls.gpu;

   /* One single-thread workgroup per block. */
   unsigned nblocks = region->size >> log2blksz;
   struct pan_compute_dim num_wg = { nblocks, 1, 1 };
   struct pan_compute_dim wg_sz = { 1, 1, 1};
   struct panfrost_ptr job =
     panvk_meta_copy_emit_compute_job(&cmdbuf->desc_pool.base,
                                      &batch->scoreboard,
                                      &num_wg, &wg_sz,
                                      0, 0, ubo, pushconsts, rsd, tsd);

   util_dynarray_append(&batch->jobs, void *, job.cpu);

   /* Track the BOs so batch submission can synchronize on them. */
   batch->blit.src = src->bo;
   batch->blit.dst = dst->bo;
   panvk_per_arch(cmd_close_batch)(cmdbuf);
}
1877 
1878 void
panvk_per_arch(CmdCopyBuffer2)1879 panvk_per_arch(CmdCopyBuffer2)(VkCommandBuffer commandBuffer,
1880                                const VkCopyBufferInfo2 *pCopyBufferInfo)
1881 {
1882    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1883    VK_FROM_HANDLE(panvk_buffer, src, pCopyBufferInfo->srcBuffer);
1884    VK_FROM_HANDLE(panvk_buffer, dst, pCopyBufferInfo->dstBuffer);
1885 
1886    for (unsigned i = 0; i < pCopyBufferInfo->regionCount; i++) {
1887       panvk_meta_copy_buf2buf(cmdbuf, src, dst, &pCopyBufferInfo->pRegions[i]);
1888    }
1889 }
1890 
/* UBO layout shared with the fill-buffer shader: destination GPU address
 * and the 32-bit fill pattern.
 */
struct panvk_meta_fill_buf_info {
   mali_ptr start;
   uint32_t val;
};
1895 
/* Emit a UBO load of one field of struct panvk_meta_fill_buf_info (UBO
 * index 0); the field's struct offset doubles as the UBO byte offset, which
 * lets the compiler later promote these loads to push constants.
 */
#define panvk_meta_fill_buf_get_info_field(b, field) \
        nir_load_ubo((b), 1, \
                     sizeof(((struct panvk_meta_fill_buf_info *)0)->field) * 8, \
                     nir_imm_int(b, 0), \
                     nir_imm_int(b, offsetof(struct panvk_meta_fill_buf_info, field)), \
                     .align_mul = 4, \
                     .align_offset = 0, \
                     .range_base = 0, \
                     .range = ~0)
1905 
1906 static mali_ptr
panvk_meta_fill_buf_shader(struct panfrost_device * pdev,struct pan_pool * bin_pool,struct pan_shader_info * shader_info)1907 panvk_meta_fill_buf_shader(struct panfrost_device *pdev,
1908                            struct pan_pool *bin_pool,
1909                            struct pan_shader_info *shader_info)
1910 {
1911    /* FIXME: Won't work on compute queues, but we can't do that with
1912     * a compute shader if the destination is an AFBC surface.
1913     */
1914    nir_builder b =
1915       nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
1916                                      GENX(pan_shader_get_compiler_options)(),
1917                                      "panvk_meta_fill_buf()");
1918 
1919    b.shader->info.num_ubos = 1;
1920 
1921    nir_ssa_def *coord = nir_load_global_invocation_id(&b, 32);
1922 
1923    nir_ssa_def *offset =
1924       nir_u2u64(&b, nir_imul(&b, nir_channel(&b, coord, 0), nir_imm_int(&b, sizeof(uint32_t))));
1925    nir_ssa_def *ptr =
1926       nir_iadd(&b, panvk_meta_fill_buf_get_info_field(&b, start), offset);
1927    nir_ssa_def *val = panvk_meta_fill_buf_get_info_field(&b, val);
1928 
1929    nir_store_global(&b, ptr, sizeof(uint32_t), val, 1);
1930 
1931    struct panfrost_compile_inputs inputs = {
1932       .gpu_id = pdev->gpu_id,
1933       .is_blit = true,
1934    };
1935 
1936    struct util_dynarray binary;
1937 
1938    util_dynarray_init(&binary, NULL);
1939    GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
1940 
1941    /* Make sure UBO words have been upgraded to push constants and everything
1942     * is at the right place.
1943     */
1944    assert(shader_info->ubo_count == 1);
1945    assert(shader_info->push.count == 3);
1946 
1947    mali_ptr shader =
1948       pan_pool_upload_aligned(bin_pool, binary.data, binary.size,
1949                               PAN_ARCH >= 6 ? 128 : 64);
1950 
1951    util_dynarray_fini(&binary);
1952    ralloc_free(b.shader);
1953 
1954    return shader;
1955 }
1956 
1957 static mali_ptr
panvk_meta_fill_buf_emit_rsd(struct panfrost_device * pdev,struct pan_pool * bin_pool,struct pan_pool * desc_pool,struct panfrost_ubo_push * pushmap)1958 panvk_meta_fill_buf_emit_rsd(struct panfrost_device *pdev,
1959                              struct pan_pool *bin_pool,
1960                              struct pan_pool *desc_pool,
1961                              struct panfrost_ubo_push *pushmap)
1962 {
1963    struct pan_shader_info shader_info;
1964 
1965    mali_ptr shader =
1966       panvk_meta_fill_buf_shader(pdev, bin_pool, &shader_info);
1967 
1968    struct panfrost_ptr rsd_ptr =
1969       pan_pool_alloc_desc_aggregate(desc_pool,
1970                                     PAN_DESC(RENDERER_STATE));
1971 
1972    pan_pack(rsd_ptr.cpu, RENDERER_STATE, cfg) {
1973       pan_shader_prepare_rsd(&shader_info, shader, &cfg);
1974    }
1975 
1976    *pushmap = shader_info.push;
1977    return rsd_ptr.gpu;
1978 }
1979 
1980 static void
panvk_meta_fill_buf_init(struct panvk_physical_device * dev)1981 panvk_meta_fill_buf_init(struct panvk_physical_device *dev)
1982 {
1983    dev->meta.copy.fillbuf.rsd =
1984       panvk_meta_fill_buf_emit_rsd(&dev->pdev, &dev->meta.bin_pool.base,
1985                                    &dev->meta.desc_pool.base,
1986                                    &dev->meta.copy.fillbuf.pushmap);
1987 }
1988 
/*
 * Record a compute dispatch filling `size` bytes of `dst`, starting at
 * `offset`, with the 32-bit pattern `val` — one single-thread workgroup
 * per word. Both offset and size must be 4-byte aligned (Vulkan valid
 * usage for vkCmdFillBuffer).
 */
static void
panvk_meta_fill_buf(struct panvk_cmd_buffer *cmdbuf,
                    const struct panvk_buffer *dst,
                    VkDeviceSize size, VkDeviceSize offset,
                    uint32_t val)
{
   struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;

   if (size == VK_WHOLE_SIZE)
      size = (dst->size - offset) & ~3ULL; /* per spec: round down to a word multiple */

   struct panvk_meta_fill_buf_info info = {
      .start = dst->bo->ptr.gpu + dst->bo_offset + offset,
      .val = val,
   };

   assert(!(offset & 3) && !(size & 3));

   unsigned nwords = size / sizeof(uint32_t);
   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.fillbuf.rsd;
   const struct panfrost_ubo_push *pushmap =
      &cmdbuf->device->physical_device->meta.copy.fillbuf.pushmap;

   mali_ptr pushconsts =
      panvk_meta_copy_emit_push_constants(pdev, pushmap, &cmdbuf->desc_pool.base,
                                          &info, sizeof(info));
   mali_ptr ubo =
      panvk_meta_copy_emit_ubo(pdev, &cmdbuf->desc_pool.base, &info, sizeof(info));

   /* Run the fill in its own batch. */
   panvk_per_arch(cmd_close_batch)(cmdbuf);

   struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

   panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, false);

   mali_ptr tsd = batch->tls.gpu;

   struct pan_compute_dim num_wg = { nwords, 1, 1 };
   struct pan_compute_dim wg_sz = { 1, 1, 1};
   struct panfrost_ptr job =
     panvk_meta_copy_emit_compute_job(&cmdbuf->desc_pool.base,
                                      &batch->scoreboard,
                                      &num_wg, &wg_sz,
                                      0, 0, ubo, pushconsts, rsd, tsd);

   util_dynarray_append(&batch->jobs, void *, job.cpu);

   /* Track the destination BO so batch submission can synchronize on it. */
   batch->blit.dst = dst->bo;
   panvk_per_arch(cmd_close_batch)(cmdbuf);
}
2040 
/* vkCmdFillBuffer entry point: thin wrapper over panvk_meta_fill_buf. */
void
panvk_per_arch(CmdFillBuffer)(VkCommandBuffer commandBuffer,
                              VkBuffer dstBuffer,
                              VkDeviceSize dstOffset,
                              VkDeviceSize fillSize,
                              uint32_t data)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   VK_FROM_HANDLE(panvk_buffer, dst, dstBuffer);

   panvk_meta_fill_buf(cmdbuf, dst, fillSize, dstOffset, data);
}
2053 
/*
 * Implement vkCmdUpdateBuffer: upload the host data into the command
 * buffer's descriptor pool (GPU-visible memory), then reuse the 4-byte
 * buf2buf copy pipeline to move it into the destination buffer. The spec
 * requires both offset and size to be multiples of 4.
 */
static void
panvk_meta_update_buf(struct panvk_cmd_buffer *cmdbuf,
                      const struct panvk_buffer *dst, VkDeviceSize offset,
                      VkDeviceSize size, const void *data)
{
   struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;

   struct panvk_meta_copy_buf2buf_info info = {
      .src = pan_pool_upload_aligned(&cmdbuf->desc_pool.base, data, size, 4),
      .dst = dst->bo->ptr.gpu + dst->bo_offset + offset,
   };

   /* Always use the 4-byte-block copy variant (log2(4) == 2). */
   unsigned log2blksz = ffs(sizeof(uint32_t)) - 1;

   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.buf2buf[log2blksz].rsd;
   const struct panfrost_ubo_push *pushmap =
      &cmdbuf->device->physical_device->meta.copy.buf2buf[log2blksz].pushmap;

   mali_ptr pushconsts =
      panvk_meta_copy_emit_push_constants(pdev, pushmap, &cmdbuf->desc_pool.base,
                                          &info, sizeof(info));
   mali_ptr ubo =
      panvk_meta_copy_emit_ubo(pdev, &cmdbuf->desc_pool.base, &info, sizeof(info));

   /* Run the copy in its own batch. */
   panvk_per_arch(cmd_close_batch)(cmdbuf);

   struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

   panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, false);

   mali_ptr tsd = batch->tls.gpu;

   /* One single-thread workgroup per 4-byte block. */
   unsigned nblocks = size >> log2blksz;
   struct pan_compute_dim num_wg = { nblocks, 1, 1 };
   struct pan_compute_dim wg_sz = { 1, 1, 1};
   struct panfrost_ptr job =
     panvk_meta_copy_emit_compute_job(&cmdbuf->desc_pool.base,
                                      &batch->scoreboard,
                                      &num_wg, &wg_sz,
                                      0, 0, ubo, pushconsts, rsd, tsd);

   util_dynarray_append(&batch->jobs, void *, job.cpu);

   /* Track the destination BO so batch submission can synchronize on it. */
   batch->blit.dst = dst->bo;
   panvk_per_arch(cmd_close_batch)(cmdbuf);
}
2101 
/* vkCmdUpdateBuffer entry point: thin wrapper over panvk_meta_update_buf. */
void
panvk_per_arch(CmdUpdateBuffer)(VkCommandBuffer commandBuffer,
                                VkBuffer dstBuffer,
                                VkDeviceSize dstOffset,
                                VkDeviceSize dataSize,
                                const void *pData)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   VK_FROM_HANDLE(panvk_buffer, dst, dstBuffer);

   panvk_meta_update_buf(cmdbuf, dst, dstOffset, dataSize, pData);
}
2114 
/*
 * Pre-compile all meta copy/fill pipelines at physical-device init time so
 * command-buffer recording never has to compile shaders.
 */
void
panvk_per_arch(meta_copy_init)(struct panvk_physical_device *dev)
{
   /* NOTE(review): the bool passed to img2img_init selects between two
    * variant sets — confirm its meaning against that function's definition.
    */
   panvk_meta_copy_img2img_init(dev, false);
   panvk_meta_copy_img2img_init(dev, true);
   panvk_meta_copy_buf2img_init(dev);
   panvk_meta_copy_img2buf_init(dev);
   panvk_meta_copy_buf2buf_init(dev);
   panvk_meta_fill_buf_init(dev);
}
2125