1 /*
2  * Copyright © 2021 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "gen_macros.h"
25 
26 #include "nir/nir_builder.h"
27 #include "pan_encoder.h"
28 #include "pan_shader.h"
29 
30 #include "panvk_private.h"
31 
/*
 * Allocate and fill a texture descriptor for the source image of a copy.
 *
 * Returns the GPU address to plug into the DCD "textures" field. The exact
 * layout depends on the architecture (see the two branches below).
 */
static mali_ptr
panvk_meta_copy_img_emit_texture(struct panfrost_device *pdev,
                                 struct pan_pool *desc_pool,
                                 const struct pan_image_view *view)
{
#if PAN_ARCH >= 6
   /* v6+: the TEXTURE descriptor and its surface payload are separate
    * allocations, and the job consumes the TEXTURE descriptor address
    * directly. */
   struct panfrost_ptr texture =
      pan_pool_alloc_desc(desc_pool, TEXTURE);
   size_t payload_size =
      GENX(panfrost_estimate_texture_payload_size)(view);
   struct panfrost_ptr surfaces =
      pan_pool_alloc_aligned(desc_pool, payload_size,
                             pan_alignment(SURFACE_WITH_STRIDE));

   GENX(panfrost_new_texture)(pdev, view, texture.cpu, &surfaces);

   return texture.gpu;
#else
   /* Pre-v6: the payload lives immediately after the TEXTURE descriptor,
    * and the job expects a pointer to an array of texture descriptor
    * pointers, hence the extra one-element indirection uploaded below. */
   size_t sz = pan_size(TEXTURE) +
               GENX(panfrost_estimate_texture_payload_size)(view);
   struct panfrost_ptr texture =
      pan_pool_alloc_aligned(desc_pool, sz, pan_alignment(TEXTURE));
   struct panfrost_ptr surfaces = {
      .cpu = texture.cpu + pan_size(TEXTURE),
      .gpu = texture.gpu + pan_size(TEXTURE),
   };

   GENX(panfrost_new_texture)(pdev, view, texture.cpu, &surfaces);

   return pan_pool_upload_aligned(desc_pool, &texture.gpu,
                                  sizeof(mali_ptr),
                                  sizeof(mali_ptr));
#endif
}
66 
/*
 * Emit the sampler shared by all meta copy shaders: nearest filtering and
 * unnormalized coordinates, since the shaders fetch texels at integer
 * coordinates derived from the destination position.
 */
static mali_ptr
panvk_meta_copy_img_emit_sampler(struct panfrost_device *pdev,
                                 struct pan_pool *desc_pool)
{
   struct panfrost_ptr sampler =
      pan_pool_alloc_desc(desc_pool, SAMPLER);

   pan_pack(sampler.cpu, SAMPLER, cfg) {
#if PAN_ARCH >= 6
      cfg.seamless_cube_map = false;
#endif
      cfg.normalized_coordinates = false;
      cfg.minify_nearest = true;
      cfg.magnify_nearest = true;
   }

   return sampler.gpu;
}
85 
/*
 * Emit the attribute + attribute-buffer descriptors carrying the source
 * coordinates varying consumed by the copy fragment shader.
 *
 * `coordinates` points to 4 vec4 entries (one per quad corner); the varying
 * is declared RGB32F, the 4th component of each entry is padding.  The
 * resulting descriptor GPU addresses are returned through `varyings` and
 * `varying_bufs`.
 */
static void
panvk_meta_copy_emit_varying(struct pan_pool *pool,
                             mali_ptr coordinates,
                             mali_ptr *varying_bufs,
                             mali_ptr *varyings)
{
   /* Bifrost needs an empty desc to mark end of prefetching */
   bool padding_buffer = PAN_ARCH >= 6;

   struct panfrost_ptr varying =
      pan_pool_alloc_desc(pool, ATTRIBUTE);
   struct panfrost_ptr varying_buffer =
      pan_pool_alloc_desc_array(pool, (padding_buffer ? 2 : 1),
                                     ATTRIBUTE_BUFFER);

   pan_pack(varying_buffer.cpu, ATTRIBUTE_BUFFER, cfg) {
      cfg.pointer = coordinates;
      /* 4 vertices, one vec4 (16 bytes) each. */
      cfg.stride = 4 * sizeof(uint32_t);
      cfg.size = cfg.stride * 4;
   }

   if (padding_buffer) {
      /* Zero-filled terminator descriptor (see padding_buffer above). */
      pan_pack(varying_buffer.cpu + pan_size(ATTRIBUTE_BUFFER),
               ATTRIBUTE_BUFFER, cfg);
   }

   pan_pack(varying.cpu, ATTRIBUTE, cfg) {
      cfg.buffer_index = 0;
      /* NOTE(review): offset_enable only on pre-v6 — presumably Midgard
       * addresses varyings through per-vertex offsets while Bifrost does
       * not; confirm against the ATTRIBUTE descriptor definition. */
      cfg.offset_enable = PAN_ARCH <= 5;
      cfg.format = pool->dev->formats[PIPE_FORMAT_R32G32B32_FLOAT].hw;
   }

   *varyings = varying.gpu;
   *varying_bufs = varying_buffer.gpu;
}
121 
/*
 * Pack the DRAW (DCD) section shared by the tiler and compute variants of
 * the meta copy jobs into `out`.
 *
 * A zero `src_coords` means "no varyings": the varying descriptors are only
 * emitted when the source coordinates buffer exists.  All other mali_ptr
 * arguments are plugged into the corresponding DCD fields as-is (0 is a
 * valid "none" value for vpd/texture/sampler/ubos/push_constants).
 */
static void
panvk_meta_copy_emit_dcd(struct pan_pool *pool,
                         mali_ptr src_coords, mali_ptr dst_coords,
                         mali_ptr texture, mali_ptr sampler,
                         mali_ptr vpd, mali_ptr tsd, mali_ptr rsd,
                         mali_ptr ubos, mali_ptr push_constants,
                         void *out)
{
   pan_pack(out, DRAW, cfg) {
      cfg.four_components_per_vertex = true;
      cfg.draw_descriptor_is_64b = true;
      cfg.thread_storage = tsd;
      cfg.state = rsd;
      cfg.uniform_buffers = ubos;
      cfg.push_uniforms = push_constants;
      cfg.position = dst_coords;
      if (src_coords) {
              panvk_meta_copy_emit_varying(pool, src_coords,
                                           &cfg.varying_buffers,
                                           &cfg.varyings);
      }
      cfg.viewport = vpd;
      cfg.textures = texture;
      cfg.samplers = sampler;
   }
}
148 
/*
 * Emit and queue a tiler job drawing a 4-vertex triangle strip (a quad
 * covering the copy region).  Returns the job descriptor so the caller can
 * reference it after submission.
 */
static struct panfrost_ptr
panvk_meta_copy_emit_tiler_job(struct pan_pool *desc_pool,
                               struct pan_scoreboard *scoreboard,
                               mali_ptr src_coords, mali_ptr dst_coords,
                               mali_ptr texture, mali_ptr sampler,
                               mali_ptr ubo, mali_ptr push_constants,
                               mali_ptr vpd, mali_ptr rsd,
                               mali_ptr tsd, mali_ptr tiler)
{
   struct panfrost_ptr job =
      pan_pool_alloc_desc(desc_pool, TILER_JOB);

   panvk_meta_copy_emit_dcd(desc_pool, src_coords, dst_coords,
                            texture, sampler, vpd, tsd, rsd, ubo, push_constants,
                            pan_section_ptr(job.cpu, TILER_JOB, DRAW));

   pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE, cfg) {
      /* One quad == a 4-vertex triangle strip. */
      cfg.draw_mode = MALI_DRAW_MODE_TRIANGLE_STRIP;
      cfg.index_count = 4;
      cfg.job_task_split = 6;
   }

   pan_section_pack(job.cpu, TILER_JOB, PRIMITIVE_SIZE, cfg) {
      cfg.constant = 1.0f;
   }

   void *invoc = pan_section_ptr(job.cpu,
                                 TILER_JOB,
                                 INVOCATION);
   /* A single 1x4 invocation: one job, 4 vertices. */
   panfrost_pack_work_groups_compute(invoc, 1, 4,
                                     1, 1, 1, 1, true, false);

#if PAN_ARCH >= 6
   pan_section_pack(job.cpu, TILER_JOB, PADDING, cfg);
   pan_section_pack(job.cpu, TILER_JOB, TILER, cfg) {
      cfg.address = tiler;
   }
#endif

   panfrost_add_job(desc_pool, scoreboard, MALI_JOB_TYPE_TILER,
                    false, false, 0, 0, &job, false);
   return job;
}
192 
/*
 * Emit and queue a compute job for copies that don't go through the tiler.
 * Returns the job descriptor so the caller can reference it after
 * submission.
 */
static struct panfrost_ptr
panvk_meta_copy_emit_compute_job(struct pan_pool *desc_pool,
                                 struct pan_scoreboard *scoreboard,
                                 const struct pan_compute_dim *num_wg,
                                 const struct pan_compute_dim *wg_sz,
                                 mali_ptr texture, mali_ptr sampler,
                                 mali_ptr ubo, mali_ptr push_constants,
                                 mali_ptr rsd, mali_ptr tsd)
{
   struct panfrost_ptr job =
      pan_pool_alloc_desc(desc_pool, COMPUTE_JOB);

   void *invoc = pan_section_ptr(job.cpu,
                                 COMPUTE_JOB,
                                 INVOCATION);
   panfrost_pack_work_groups_compute(invoc, num_wg->x, num_wg->y, num_wg->z,
                                     wg_sz->x, wg_sz->y, wg_sz->z,
                                     false, false);

   pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = 8;
   }

   /* No source coords, position buffer or viewport on the compute path. */
   panvk_meta_copy_emit_dcd(desc_pool, 0, 0, texture, sampler,
                            0, tsd, rsd, ubo, push_constants,
                            pan_section_ptr(job.cpu, COMPUTE_JOB, DRAW));

   panfrost_add_job(desc_pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                    false, false, 0, 0, &job, false);
   return job;
}
224 
225 
226 #if PAN_ARCH >= 6
227 static uint32_t
panvk_meta_copy_img_bifrost_raw_format(unsigned texelsize)228 panvk_meta_copy_img_bifrost_raw_format(unsigned texelsize)
229 {
230    switch (texelsize) {
231    case 6: return MALI_RGB16UI << 12;
232    case 8: return MALI_RG32UI << 12;
233    case 12: return MALI_RGB32UI << 12;
234    case 16: return MALI_RGBA32UI << 12;
235    default: unreachable("Invalid texel size\n");
236    }
237 }
238 #endif
239 
/*
 * Emit the renderer state (+ one blend) descriptor used when the copy
 * destination is an image.
 *
 * `fmt`/`wrmask` describe the render-target format and the components
 * actually written.  Blockier-than-32bit formats are treated as "raw"
 * (opaque integer data): partial writes of blendable formats go through the
 * fixed-function blend color mask, while partial writes of raw formats
 * require the shader to read back the tilebuffer (pre-v6 path).
 */
static mali_ptr
panvk_meta_copy_to_img_emit_rsd(struct panfrost_device *pdev,
                                struct pan_pool *desc_pool,
                                mali_ptr shader,
                                const struct pan_shader_info *shader_info,
                                enum pipe_format fmt, unsigned wrmask,
                                bool from_img)
{
   struct panfrost_ptr rsd_ptr =
      pan_pool_alloc_desc_aggregate(desc_pool,
                                    PAN_DESC(RENDERER_STATE),
                                    PAN_DESC_ARRAY(1, BLEND));

   bool raw = util_format_get_blocksize(fmt) > 4;
   unsigned fullmask = (1 << util_format_get_nr_components(fmt)) - 1;
   bool partialwrite = fullmask != wrmask && !raw;
   bool readstb = fullmask != wrmask && raw;

   pan_pack(rsd_ptr.cpu, RENDERER_STATE, cfg) {
      pan_shader_prepare_rsd(shader_info, shader, &cfg);
      if (from_img) {
         /* Image sources need the coords varying plus a texture/sampler. */
         cfg.shader.varying_count = 1;
         cfg.shader.texture_count = 1;
         cfg.shader.sampler_count = 1;
      }
      /* Depth/stencil are irrelevant for a copy: pass everything. */
      cfg.properties.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
      cfg.multisample_misc.sample_mask = UINT16_MAX;
      cfg.multisample_misc.depth_function = MALI_FUNC_ALWAYS;
      cfg.stencil_mask_misc.stencil_mask_front = 0xFF;
      cfg.stencil_mask_misc.stencil_mask_back = 0xFF;
      cfg.stencil_front.compare_function = MALI_FUNC_ALWAYS;
      cfg.stencil_front.stencil_fail = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.depth_fail = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.depth_pass = MALI_STENCIL_OP_REPLACE;
      cfg.stencil_front.mask = 0xFF;
      cfg.stencil_back = cfg.stencil_front;

#if PAN_ARCH >= 6
      cfg.properties.allow_forward_pixel_to_be_killed = true;
      /* Pixels can only be killed early if we overwrite everything. */
      cfg.properties.allow_forward_pixel_to_kill =
         !partialwrite && !readstb;
      cfg.properties.zs_update_operation =
         MALI_PIXEL_KILL_STRONG_EARLY;
      cfg.properties.pixel_kill_operation =
         MALI_PIXEL_KILL_FORCE_EARLY;
#else
      cfg.properties.shader_reads_tilebuffer = readstb;
      cfg.properties.work_register_count = shader_info->work_reg_count;
      cfg.properties.force_early_z = true;
      cfg.stencil_mask_misc.alpha_test_compare_function = MALI_FUNC_ALWAYS;
#endif
   }

   pan_pack(rsd_ptr.cpu + pan_size(RENDERER_STATE), BLEND, cfg) {
      cfg.round_to_fb_precision = true;
      /* Partial writes must blend with the existing content. */
      cfg.load_destination = partialwrite;
      /* src * 1 + dst * 0: straight copy of the shader output. */
      cfg.equation.rgb.a = MALI_BLEND_OPERAND_A_SRC;
      cfg.equation.rgb.b = MALI_BLEND_OPERAND_B_SRC;
      cfg.equation.rgb.c = MALI_BLEND_OPERAND_C_ZERO;
      cfg.equation.alpha.a = MALI_BLEND_OPERAND_A_SRC;
      cfg.equation.alpha.b = MALI_BLEND_OPERAND_B_SRC;
      cfg.equation.alpha.c = MALI_BLEND_OPERAND_C_ZERO;
#if PAN_ARCH >= 6
      cfg.internal.mode =
         partialwrite ?
         MALI_BLEND_MODE_FIXED_FUNCTION :
         MALI_BLEND_MODE_OPAQUE;
      cfg.equation.color_mask = partialwrite ? wrmask : 0xf;
      cfg.internal.fixed_function.num_comps = 4;
      if (!raw) {
         cfg.internal.fixed_function.conversion.memory_format =
            panfrost_format_to_bifrost_blend(pdev, fmt, false);
         cfg.internal.fixed_function.conversion.register_format =
            MALI_REGISTER_FILE_FORMAT_F32;
      } else {
         /* Raw copies keep integer data in U16/U32 registers depending on
          * the component size. */
         unsigned imgtexelsz = util_format_get_blocksize(fmt);

         cfg.internal.fixed_function.conversion.memory_format =
            panvk_meta_copy_img_bifrost_raw_format(imgtexelsz);
         cfg.internal.fixed_function.conversion.register_format =
            (imgtexelsz & 2) ?
            MALI_REGISTER_FILE_FORMAT_U16 :
            MALI_REGISTER_FILE_FORMAT_U32;
      }
#else
      cfg.equation.color_mask = wrmask;
#endif
   }

   return rsd_ptr.gpu;
}
331 
/*
 * Upload `size` bytes of `data` and wrap them in a UNIFORM_BUFFER
 * descriptor.  UBO entries are 16-byte slots, hence the round-up.
 */
static mali_ptr
panvk_meta_copy_emit_ubo(struct panfrost_device *pdev,
                         struct pan_pool *pool,
                         void *data, unsigned size)
{
   struct panfrost_ptr ubo = pan_pool_alloc_desc(pool, UNIFORM_BUFFER);

   pan_pack(ubo.cpu, UNIFORM_BUFFER, cfg) {
      cfg.entries = DIV_ROUND_UP(size, 16);
      cfg.pointer = pan_pool_upload_aligned(pool, data, size, 16);
   }

   return ubo.gpu;
}
346 
347 static mali_ptr
panvk_meta_copy_emit_push_constants(struct panfrost_device * pdev,const struct panfrost_ubo_push * pushmap,struct pan_pool * pool,const void * data,unsigned size)348 panvk_meta_copy_emit_push_constants(struct panfrost_device *pdev,
349                                     const struct panfrost_ubo_push *pushmap,
350                                     struct pan_pool *pool,
351                                     const void *data, unsigned size)
352 {
353    assert(pushmap->count <= (size / 4));
354 
355    const uint32_t *in = data;
356    uint32_t pushvals[PAN_MAX_PUSH];
357 
358    for (unsigned i = 0; i < pushmap->count; i++) {
359       assert(i < ARRAY_SIZE(pushvals));
360       assert(pushmap->words[i].ubo == 0);
361       assert(pushmap->words[i].offset < size);
362       pushvals[i] = in[pushmap->words[i].offset / 4];
363    }
364 
365    return pan_pool_upload_aligned(pool, pushvals, size, 16);
366 }
367 
/*
 * Emit the renderer state descriptor used when the copy destination is a
 * buffer.  No BLEND descriptor is allocated: there is no render target on
 * this path.
 */
static mali_ptr
panvk_meta_copy_to_buf_emit_rsd(struct panfrost_device *pdev,
                                struct pan_pool *desc_pool,
                                mali_ptr shader,
                                const struct pan_shader_info *shader_info,
                                bool from_img)
{
   struct panfrost_ptr rsd_ptr =
      pan_pool_alloc_desc_aggregate(desc_pool,
                                    PAN_DESC(RENDERER_STATE));

   pan_pack(rsd_ptr.cpu, RENDERER_STATE, cfg) {
      pan_shader_prepare_rsd(shader_info, shader, &cfg);
      if (from_img) {
         /* Image sources sample the source through a texture/sampler. */
         cfg.shader.texture_count = 1;
         cfg.shader.sampler_count = 1;
      }
   }

   return rsd_ptr.gpu;
}
389 
390 static mali_ptr
panvk_meta_copy_img2img_shader(struct panfrost_device * pdev,struct pan_pool * bin_pool,enum pipe_format srcfmt,enum pipe_format dstfmt,unsigned dstmask,unsigned texdim,bool texisarray,bool is_ms,struct pan_shader_info * shader_info)391 panvk_meta_copy_img2img_shader(struct panfrost_device *pdev,
392                                struct pan_pool *bin_pool,
393                                enum pipe_format srcfmt,
394                                enum pipe_format dstfmt, unsigned dstmask,
395                                unsigned texdim, bool texisarray, bool is_ms,
396                                struct pan_shader_info *shader_info)
397 {
398    nir_builder b =
399       nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
400                                      GENX(pan_shader_get_compiler_options)(),
401                                      "panvk_meta_copy_img2img(srcfmt=%s,dstfmt=%s,%dD%s%s)",
402                                      util_format_name(srcfmt), util_format_name(dstfmt),
403                                      texdim, texisarray ? "[]" : "", is_ms ? ",ms" : "");
404 
405    b.shader->info.internal = true;
406 
407    nir_variable *coord_var =
408       nir_variable_create(b.shader, nir_var_shader_in,
409                           glsl_vector_type(GLSL_TYPE_FLOAT, texdim + texisarray),
410                           "coord");
411    coord_var->data.location = VARYING_SLOT_TEX0;
412    nir_ssa_def *coord = nir_f2u32(&b, nir_load_var(&b, coord_var));
413 
414    nir_tex_instr *tex = nir_tex_instr_create(b.shader, is_ms ? 2 : 1);
415    tex->op = is_ms ? nir_texop_txf_ms : nir_texop_txf;
416    tex->texture_index = 0;
417    tex->is_array = texisarray;
418    tex->dest_type = util_format_is_unorm(srcfmt) ?
419                     nir_type_float32 : nir_type_uint32;
420 
421    switch (texdim) {
422    case 1: tex->sampler_dim = GLSL_SAMPLER_DIM_1D; break;
423    case 2: tex->sampler_dim = GLSL_SAMPLER_DIM_2D; break;
424    case 3: tex->sampler_dim = GLSL_SAMPLER_DIM_3D; break;
425    default: unreachable("Invalid texture dimension");
426    }
427 
428    tex->src[0].src_type = nir_tex_src_coord;
429    tex->src[0].src = nir_src_for_ssa(coord);
430    tex->coord_components = texdim + texisarray;
431 
432    if (is_ms) {
433       tex->src[1].src_type = nir_tex_src_ms_index;
434       tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(&b));
435    }
436 
437    nir_ssa_dest_init(&tex->instr, &tex->dest, 4,
438                      nir_alu_type_get_type_size(tex->dest_type), NULL);
439    nir_builder_instr_insert(&b, &tex->instr);
440 
441    nir_ssa_def *texel = &tex->dest.ssa;
442 
443    unsigned dstcompsz =
444       util_format_get_component_bits(dstfmt, UTIL_FORMAT_COLORSPACE_RGB, 0);
445    unsigned ndstcomps = util_format_get_nr_components(dstfmt);
446    const struct glsl_type *outtype = NULL;
447 
448    if (srcfmt == PIPE_FORMAT_R5G6B5_UNORM && dstfmt == PIPE_FORMAT_R8G8_UNORM) {
449       nir_ssa_def *rgb =
450          nir_f2u32(&b, nir_fmul(&b, texel,
451                                 nir_vec3(&b,
452                                          nir_imm_float(&b, 31),
453                                          nir_imm_float(&b, 63),
454                                          nir_imm_float(&b, 31))));
455       nir_ssa_def *rg =
456          nir_vec2(&b,
457                   nir_ior(&b, nir_channel(&b, rgb, 0),
458                           nir_ishl(&b, nir_channel(&b, rgb, 1),
459                                    nir_imm_int(&b, 5))),
460                   nir_ior(&b,
461                           nir_ushr_imm(&b, nir_channel(&b, rgb, 1), 3),
462                           nir_ishl(&b, nir_channel(&b, rgb, 2),
463                                    nir_imm_int(&b, 3))));
464       rg = nir_iand_imm(&b, rg, 255);
465       texel = nir_fmul_imm(&b, nir_u2f32(&b, rg), 1.0 / 255);
466       outtype = glsl_vector_type(GLSL_TYPE_FLOAT, 2);
467    } else if (srcfmt == PIPE_FORMAT_R8G8_UNORM && dstfmt == PIPE_FORMAT_R5G6B5_UNORM) {
468       nir_ssa_def *rg = nir_f2u32(&b, nir_fmul_imm(&b, texel, 255));
469       nir_ssa_def *rgb =
470          nir_vec3(&b,
471                   nir_channel(&b, rg, 0),
472                   nir_ior(&b,
473                           nir_ushr_imm(&b, nir_channel(&b, rg, 0), 5),
474                           nir_ishl(&b, nir_channel(&b, rg, 1),
475                                    nir_imm_int(&b, 3))),
476                   nir_ushr_imm(&b, nir_channel(&b, rg, 1), 3));
477       rgb = nir_iand(&b, rgb,
478                      nir_vec3(&b,
479                               nir_imm_int(&b, 31),
480                               nir_imm_int(&b, 63),
481                               nir_imm_int(&b, 31)));
482       texel = nir_fmul(&b, nir_u2f32(&b, rgb),
483                        nir_vec3(&b,
484                                 nir_imm_float(&b, 1.0 / 31),
485                                 nir_imm_float(&b, 1.0 / 63),
486                                 nir_imm_float(&b, 1.0 / 31)));
487       outtype = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
488    } else {
489       assert(srcfmt == dstfmt);
490       enum glsl_base_type basetype;
491       if (util_format_is_unorm(dstfmt)) {
492          basetype = GLSL_TYPE_FLOAT;
493       } else if (dstcompsz == 16) {
494          basetype = GLSL_TYPE_UINT16;
495       } else {
496          assert(dstcompsz == 32);
497          basetype = GLSL_TYPE_UINT;
498       }
499 
500       if (dstcompsz == 16)
501          texel = nir_u2u16(&b, texel);
502 
503       texel = nir_channels(&b, texel, (1 << ndstcomps) - 1);
504       outtype = glsl_vector_type(basetype, ndstcomps);
505    }
506 
507    nir_variable *out =
508       nir_variable_create(b.shader, nir_var_shader_out, outtype, "out");
509    out->data.location = FRAG_RESULT_DATA0;
510 
511    unsigned fullmask = (1 << ndstcomps) - 1;
512    if (dstcompsz > 8 && dstmask != fullmask) {
513       nir_ssa_def *oldtexel = nir_load_var(&b, out);
514       nir_ssa_def *dstcomps[4];
515 
516       for (unsigned i = 0; i < ndstcomps; i++) {
517          if (dstmask & BITFIELD_BIT(i))
518             dstcomps[i] = nir_channel(&b, texel, i);
519          else
520             dstcomps[i] = nir_channel(&b, oldtexel, i);
521       }
522 
523       texel = nir_vec(&b, dstcomps, ndstcomps);
524    }
525 
526    nir_store_var(&b, out, texel, 0xff);
527 
528    struct panfrost_compile_inputs inputs = {
529       .gpu_id = pdev->gpu_id,
530       .is_blit = true,
531    };
532 
533 #if PAN_ARCH >= 6
534    pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) {
535       cfg.memory_format = (dstcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12;
536       cfg.register_format = dstcompsz == 2 ?
537                             MALI_REGISTER_FILE_FORMAT_U16 :
538                             MALI_REGISTER_FILE_FORMAT_U32;
539    }
540    inputs.bifrost.static_rt_conv = true;
541 #endif
542 
543    struct util_dynarray binary;
544 
545    util_dynarray_init(&binary, NULL);
546    GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
547 
548    shader_info->fs.sample_shading = is_ms;
549 
550    mali_ptr shader =
551       pan_pool_upload_aligned(bin_pool, binary.data, binary.size,
552                               PAN_ARCH >= 6 ? 128 : 64);
553 
554    util_dynarray_fini(&binary);
555    ralloc_free(b.shader);
556 
557    return shader;
558 }
559 
560 static enum pipe_format
panvk_meta_copy_img_format(enum pipe_format fmt)561 panvk_meta_copy_img_format(enum pipe_format fmt)
562 {
563    /* We can't use a non-compressed format when handling a tiled/AFBC
564     * compressed format because the tile size differ (4x4 blocks for
565     * compressed formats and 16x16 texels for non-compressed ones).
566     */
567    assert(!util_format_is_compressed(fmt));
568 
569    /* Pick blendable formats when we can, otherwise pick the UINT variant
570     * matching the texel size.
571     */
572    switch (util_format_get_blocksize(fmt)) {
573    case 16: return PIPE_FORMAT_R32G32B32A32_UINT;
574    case 12: return PIPE_FORMAT_R32G32B32_UINT;
575    case 8: return PIPE_FORMAT_R32G32_UINT;
576    case 6: return PIPE_FORMAT_R16G16B16_UINT;
577    case 4: return PIPE_FORMAT_R8G8B8A8_UNORM;
578    case 2: return (fmt == PIPE_FORMAT_R5G6B5_UNORM ||
579                    fmt == PIPE_FORMAT_B5G6R5_UNORM) ?
580                   PIPE_FORMAT_R5G6B5_UNORM : PIPE_FORMAT_R8G8_UNORM;
581    case 1: return PIPE_FORMAT_R8_UNORM;
582    default: unreachable("Unsupported format\n");
583    }
584 }
585 
/* Key identifying one compiled img2img copy shader variant. */
struct panvk_meta_copy_img2img_format_info {
   enum pipe_format srcfmt;  /* canonical source copy format */
   enum pipe_format dstfmt;  /* canonical destination copy format */
   unsigned dstmask;         /* destination component writemask */
};
591 
/* All supported (srcfmt, dstfmt, writemask) combinations.  The order is
 * significant: panvk_meta_copy_img2img_format_idx() returns an index into
 * this table, and the count must match PANVK_META_COPY_IMG2IMG_NUM_FORMATS. */
static const struct panvk_meta_copy_img2img_format_info panvk_meta_copy_img2img_fmts[] = {
   { PIPE_FORMAT_R8_UNORM, PIPE_FORMAT_R8_UNORM, 0x1},
   { PIPE_FORMAT_R5G6B5_UNORM, PIPE_FORMAT_R5G6B5_UNORM, 0x7},
   { PIPE_FORMAT_R5G6B5_UNORM, PIPE_FORMAT_R8G8_UNORM, 0x3},
   { PIPE_FORMAT_R8G8_UNORM, PIPE_FORMAT_R5G6B5_UNORM, 0x7},
   { PIPE_FORMAT_R8G8_UNORM, PIPE_FORMAT_R8G8_UNORM, 0x3},
   /* Z24S8(depth) */
   { PIPE_FORMAT_R8G8B8A8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM, 0x7 },
   /* Z24S8(stencil) */
   { PIPE_FORMAT_R8G8B8A8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM, 0x8 },
   { PIPE_FORMAT_R8G8B8A8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM, 0xf },
   { PIPE_FORMAT_R16G16B16_UINT, PIPE_FORMAT_R16G16B16_UINT, 0x7 },
   { PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32_UINT, 0x3 },
   /* Z32S8X24(depth) */
   { PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32_UINT, 0x1 },
   /* Z32S8X24(stencil) */
   { PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32_UINT, 0x2 },
   { PIPE_FORMAT_R32G32B32_UINT, PIPE_FORMAT_R32G32B32_UINT, 0x7 },
   { PIPE_FORMAT_R32G32B32A32_UINT, PIPE_FORMAT_R32G32B32A32_UINT, 0xf },
};
612 
613 static unsigned
panvk_meta_copy_img2img_format_idx(struct panvk_meta_copy_img2img_format_info key)614 panvk_meta_copy_img2img_format_idx(struct panvk_meta_copy_img2img_format_info key)
615 {
616    STATIC_ASSERT(ARRAY_SIZE(panvk_meta_copy_img2img_fmts) == PANVK_META_COPY_IMG2IMG_NUM_FORMATS);
617 
618    for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_img2img_fmts); i++) {
619       if (!memcmp(&key, &panvk_meta_copy_img2img_fmts[i], sizeof(key)))
620          return i;
621    }
622 
623    unreachable("Invalid image format\n");
624 }
625 
626 static unsigned
panvk_meta_copy_img_mask(enum pipe_format imgfmt,VkImageAspectFlags aspectMask)627 panvk_meta_copy_img_mask(enum pipe_format imgfmt, VkImageAspectFlags aspectMask)
628 {
629    if (aspectMask != VK_IMAGE_ASPECT_DEPTH_BIT &&
630        aspectMask != VK_IMAGE_ASPECT_STENCIL_BIT) {
631       enum pipe_format outfmt = panvk_meta_copy_img_format(imgfmt);
632 
633       return (1 << util_format_get_nr_components(outfmt)) - 1;
634    }
635 
636    switch (imgfmt) {
637    case PIPE_FORMAT_S8_UINT:
638       return 1;
639    case PIPE_FORMAT_Z16_UNORM:
640       return 3;
641    case PIPE_FORMAT_Z16_UNORM_S8_UINT:
642       return aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT ? 3 : 8;
643    case PIPE_FORMAT_Z24_UNORM_S8_UINT:
644       return aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT ? 7 : 8;
645    case PIPE_FORMAT_Z24X8_UNORM:
646       assert(aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT);
647       return 7;
648    case PIPE_FORMAT_Z32_FLOAT:
649       return 0xf;
650    case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
651       return aspectMask == VK_IMAGE_ASPECT_DEPTH_BIT ? 1 : 2;
652    default:
653       unreachable("Invalid depth format\n");
654    }
655 }
656 
657 static void
panvk_meta_copy_img2img(struct panvk_cmd_buffer * cmdbuf,const struct panvk_image * src,const struct panvk_image * dst,const VkImageCopy * region)658 panvk_meta_copy_img2img(struct panvk_cmd_buffer *cmdbuf,
659                         const struct panvk_image *src,
660                         const struct panvk_image *dst,
661                         const VkImageCopy *region)
662 {
663    struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;
664    struct pan_fb_info *fbinfo = &cmdbuf->state.fb.info;
665    struct panvk_meta_copy_img2img_format_info key = {
666       .srcfmt = panvk_meta_copy_img_format(src->pimage.layout.format),
667       .dstfmt = panvk_meta_copy_img_format(dst->pimage.layout.format),
668       .dstmask = panvk_meta_copy_img_mask(dst->pimage.layout.format,
669                                           region->dstSubresource.aspectMask),
670    };
671 
672    assert(src->pimage.layout.nr_samples == dst->pimage.layout.nr_samples);
673 
674    unsigned texdimidx =
675       panvk_meta_copy_tex_type(src->pimage.layout.dim,
676                                src->pimage.layout.array_size > 1);
677    unsigned fmtidx =
678       panvk_meta_copy_img2img_format_idx(key);
679    unsigned ms = dst->pimage.layout.nr_samples > 1 ? 1 : 0;
680 
681    mali_ptr rsd =
682       cmdbuf->device->physical_device->meta.copy.img2img[ms][texdimidx][fmtidx].rsd;
683 
684    struct pan_image_view srcview = {
685       .format = key.srcfmt,
686       .dim = src->pimage.layout.dim == MALI_TEXTURE_DIMENSION_CUBE ?
687              MALI_TEXTURE_DIMENSION_2D : src->pimage.layout.dim,
688       .image = &src->pimage,
689       .nr_samples = src->pimage.layout.nr_samples,
690       .first_level = region->srcSubresource.mipLevel,
691       .last_level = region->srcSubresource.mipLevel,
692       .first_layer = region->srcSubresource.baseArrayLayer,
693       .last_layer = region->srcSubresource.baseArrayLayer + region->srcSubresource.layerCount - 1,
694       .swizzle = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
695    };
696 
697    struct pan_image_view dstview = {
698       .format = key.dstfmt,
699       .dim = MALI_TEXTURE_DIMENSION_2D,
700       .image = &dst->pimage,
701       .nr_samples = dst->pimage.layout.nr_samples,
702       .first_level = region->dstSubresource.mipLevel,
703       .last_level = region->dstSubresource.mipLevel,
704       .swizzle = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
705    };
706 
707    unsigned minx = MAX2(region->dstOffset.x, 0);
708    unsigned miny = MAX2(region->dstOffset.y, 0);
709    unsigned maxx = MAX2(region->dstOffset.x + region->extent.width - 1, 0);
710    unsigned maxy = MAX2(region->dstOffset.y + region->extent.height - 1, 0);
711 
712    mali_ptr vpd =
713       panvk_per_arch(meta_emit_viewport)(&cmdbuf->desc_pool.base,
714                                          minx, miny, maxx, maxy);
715 
716    float dst_rect[] = {
717       minx, miny, 0.0, 1.0,
718       maxx + 1, miny, 0.0, 1.0,
719       minx, maxy + 1, 0.0, 1.0,
720       maxx + 1, maxy + 1, 0.0, 1.0,
721    };
722 
723    mali_ptr dst_coords =
724       pan_pool_upload_aligned(&cmdbuf->desc_pool.base, dst_rect,
725                               sizeof(dst_rect), 64);
726 
727    /* TODO: don't force preloads of dst resources if unneeded */
728 
729    unsigned width = u_minify(dst->pimage.layout.width, region->dstSubresource.mipLevel);
730    unsigned height = u_minify(dst->pimage.layout.height, region->dstSubresource.mipLevel);
731    cmdbuf->state.fb.crc_valid[0] = false;
732    *fbinfo = (struct pan_fb_info){
733       .width = width,
734       .height = height,
735       .extent.minx = minx & ~31,
736       .extent.miny = miny & ~31,
737       .extent.maxx = MIN2(ALIGN_POT(maxx + 1, 32), width) - 1,
738       .extent.maxy = MIN2(ALIGN_POT(maxy + 1, 32), height) - 1,
739       .nr_samples = dst->pimage.layout.nr_samples,
740       .rt_count = 1,
741       .rts[0].view = &dstview,
742       .rts[0].preload = true,
743       .rts[0].crc_valid = &cmdbuf->state.fb.crc_valid[0],
744    };
745 
746    mali_ptr texture =
747       panvk_meta_copy_img_emit_texture(pdev, &cmdbuf->desc_pool.base, &srcview);
748    mali_ptr sampler =
749       panvk_meta_copy_img_emit_sampler(pdev, &cmdbuf->desc_pool.base);
750 
751    panvk_per_arch(cmd_close_batch)(cmdbuf);
752 
753    minx = MAX2(region->srcOffset.x, 0);
754    miny = MAX2(region->srcOffset.y, 0);
755    maxx = MAX2(region->srcOffset.x + region->extent.width - 1, 0);
756    maxy = MAX2(region->srcOffset.y + region->extent.height - 1, 0);
757    assert(region->dstOffset.z >= 0);
758 
759    unsigned first_src_layer = MAX2(0, region->srcOffset.z);
760    unsigned first_dst_layer = MAX2(region->dstSubresource.baseArrayLayer, region->dstOffset.z);
761    unsigned nlayers = MAX2(region->dstSubresource.layerCount, region->extent.depth);
762    for (unsigned l = 0; l < nlayers; l++) {
763       unsigned src_l = l + first_src_layer;
764       float src_rect[] = {
765          minx, miny, src_l, 1.0,
766          maxx + 1, miny, src_l, 1.0,
767          minx, maxy + 1, src_l, 1.0,
768          maxx + 1, maxy + 1, src_l, 1.0,
769       };
770 
771       mali_ptr src_coords =
772          pan_pool_upload_aligned(&cmdbuf->desc_pool.base, src_rect,
773                                  sizeof(src_rect), 64);
774 
775       struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);
776 
777       dstview.first_layer = dstview.last_layer = l + first_dst_layer;
778       batch->blit.src = src->pimage.data.bo;
779       batch->blit.dst = dst->pimage.data.bo;
780       panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, true);
781       panvk_per_arch(cmd_alloc_fb_desc)(cmdbuf);
782       panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf);
783 
784       mali_ptr tsd, tiler;
785 
786 #if PAN_ARCH >= 6
787       tsd = batch->tls.gpu;
788       tiler = batch->tiler.descs.gpu;
789 #else
790       tsd = batch->fb.desc.gpu;
791       tiler = 0;
792 #endif
793 
794       struct panfrost_ptr job;
795 
796       job = panvk_meta_copy_emit_tiler_job(&cmdbuf->desc_pool.base,
797                                            &batch->scoreboard,
798                                            src_coords, dst_coords,
799                                            texture, sampler, 0, 0,
800                                            vpd, rsd, tsd, tiler);
801 
802       util_dynarray_append(&batch->jobs, void *, job.cpu);
803       panvk_per_arch(cmd_close_batch)(cmdbuf);
804    }
805 }
806 
807 static void
panvk_meta_copy_img2img_init(struct panvk_physical_device * dev,bool is_ms)808 panvk_meta_copy_img2img_init(struct panvk_physical_device *dev, bool is_ms)
809 {
810    STATIC_ASSERT(ARRAY_SIZE(panvk_meta_copy_img2img_fmts) == PANVK_META_COPY_IMG2IMG_NUM_FORMATS);
811 
812    for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_img2img_fmts); i++) {
813       for (unsigned texdim = 1; texdim <= 3; texdim++) {
814          unsigned texdimidx = panvk_meta_copy_tex_type(texdim, false);
815          assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2img[0]));
816 
817          /* No MSAA on 3D textures */
818          if (texdim == 3 && is_ms) continue;
819 
820          struct pan_shader_info shader_info;
821          mali_ptr shader =
822             panvk_meta_copy_img2img_shader(&dev->pdev, &dev->meta.bin_pool.base,
823                                            panvk_meta_copy_img2img_fmts[i].srcfmt,
824                                            panvk_meta_copy_img2img_fmts[i].dstfmt,
825                                            panvk_meta_copy_img2img_fmts[i].dstmask,
826                                            texdim, false, is_ms, &shader_info);
827          dev->meta.copy.img2img[is_ms][texdimidx][i].rsd =
828             panvk_meta_copy_to_img_emit_rsd(&dev->pdev, &dev->meta.desc_pool.base,
829                                             shader, &shader_info,
830                                             panvk_meta_copy_img2img_fmts[i].dstfmt,
831                                             panvk_meta_copy_img2img_fmts[i].dstmask,
832                                             true);
833          if (texdim == 3)
834             continue;
835 
836          memset(&shader_info, 0, sizeof(shader_info));
837          texdimidx = panvk_meta_copy_tex_type(texdim, true);
838          assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2img[0]));
839          shader =
840             panvk_meta_copy_img2img_shader(&dev->pdev, &dev->meta.bin_pool.base,
841                                            panvk_meta_copy_img2img_fmts[i].srcfmt,
842                                            panvk_meta_copy_img2img_fmts[i].dstfmt,
843                                            panvk_meta_copy_img2img_fmts[i].dstmask,
844                                            texdim, true, is_ms, &shader_info);
845          dev->meta.copy.img2img[is_ms][texdimidx][i].rsd =
846             panvk_meta_copy_to_img_emit_rsd(&dev->pdev, &dev->meta.desc_pool.base,
847                                             shader, &shader_info,
848                                             panvk_meta_copy_img2img_fmts[i].dstfmt,
849                                             panvk_meta_copy_img2img_fmts[i].dstmask,
850                                             true);
851       }
852    }
853 }
854 
855 void
panvk_per_arch(CmdCopyImage)856 panvk_per_arch(CmdCopyImage)(VkCommandBuffer commandBuffer,
857                              VkImage srcImage,
858                              VkImageLayout srcImageLayout,
859                              VkImage destImage,
860                              VkImageLayout destImageLayout,
861                              uint32_t regionCount,
862                              const VkImageCopy *pRegions)
863 {
864    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
865    VK_FROM_HANDLE(panvk_image, dst, destImage);
866    VK_FROM_HANDLE(panvk_image, src, srcImage);
867 
868    for (unsigned i = 0; i < regionCount; i++) {
869       panvk_meta_copy_img2img(cmdbuf, src, dst, &pRegions[i]);
870    }
871 }
872 
873 static unsigned
panvk_meta_copy_buf_texelsize(enum pipe_format imgfmt,unsigned mask)874 panvk_meta_copy_buf_texelsize(enum pipe_format imgfmt, unsigned mask)
875 {
876    unsigned imgtexelsz = util_format_get_blocksize(imgfmt);
877    unsigned nbufcomps = util_bitcount(mask);
878 
879    if (nbufcomps == util_format_get_nr_components(imgfmt))
880       return imgtexelsz;
881 
882    /* Special case for Z24 buffers which are not tightly packed */
883    if (mask == 7 && imgtexelsz == 4)
884       return 4;
885 
886    /* Special case for S8 extraction from Z32_S8X24 */
887    if (mask == 2 && imgtexelsz == 8)
888       return 1;
889 
890    unsigned compsz =
891       util_format_get_component_bits(imgfmt, UTIL_FORMAT_COLORSPACE_RGB, 0);
892 
893    assert(!(compsz % 8));
894 
895    return nbufcomps * compsz / 8;
896 }
897 
898 static enum pipe_format
panvk_meta_copy_buf2img_format(enum pipe_format imgfmt)899 panvk_meta_copy_buf2img_format(enum pipe_format imgfmt)
900 {
901    /* Pick blendable formats when we can, and the FLOAT variant matching the
902     * texelsize otherwise.
903     */
904    switch (util_format_get_blocksize(imgfmt)) {
905    case 1: return PIPE_FORMAT_R8_UNORM;
906    /* AFBC stores things differently for RGB565,
907     * we can't simply map to R8G8 in that case */
908    case 2: return (imgfmt == PIPE_FORMAT_R5G6B5_UNORM ||
909                    imgfmt == PIPE_FORMAT_B5G6R5_UNORM) ?
910                   PIPE_FORMAT_R5G6B5_UNORM : PIPE_FORMAT_R8G8_UNORM;
911    case 4: return PIPE_FORMAT_R8G8B8A8_UNORM;
912    case 6: return PIPE_FORMAT_R16G16B16_UINT;
913    case 8: return PIPE_FORMAT_R32G32_UINT;
914    case 12: return PIPE_FORMAT_R32G32B32_UINT;
915    case 16: return PIPE_FORMAT_R32G32B32A32_UINT;
916    default: unreachable("Invalid format\n");
917    }
918 }
919 
/* Key identifying one buffer<->image copy shader variant: the image-side
 * (render-target) format paired with the component writemask.
 */
struct panvk_meta_copy_format_info {
   enum pipe_format imgfmt; /* image-side format for the copy */
   unsigned mask;           /* mask of components transferred */
};
924 
/* Buffer-to-image copy variants. The full-mask entries cover plain color
 * copies grouped by texel size; the partial-mask entries at the end cover
 * single-aspect copies into packed depth/stencil formats.
 */
static const struct panvk_meta_copy_format_info panvk_meta_copy_buf2img_fmts[] = {
   { PIPE_FORMAT_R8_UNORM, 0x1 },
   { PIPE_FORMAT_R8G8_UNORM, 0x3 },
   { PIPE_FORMAT_R5G6B5_UNORM, 0x7 },
   { PIPE_FORMAT_R8G8B8A8_UNORM, 0xf },
   { PIPE_FORMAT_R16G16B16_UINT, 0x7 },
   { PIPE_FORMAT_R32G32_UINT, 0x3 },
   { PIPE_FORMAT_R32G32B32_UINT, 0x7 },
   { PIPE_FORMAT_R32G32B32A32_UINT, 0xf },
   /* S8 -> Z24S8 */
   { PIPE_FORMAT_R8G8B8A8_UNORM, 0x8 },
   /* S8 -> Z32_S8X24 */
   { PIPE_FORMAT_R32G32_UINT, 0x2 },
   /* Z24X8 -> Z24S8 */
   { PIPE_FORMAT_R8G8B8A8_UNORM, 0x7 },
   /* Z32 -> Z32_S8X24 */
   { PIPE_FORMAT_R32G32_UINT, 0x1 },
};
943 
/* UBO/push-constant payload consumed by the buffer-to-image copy shader. */
struct panvk_meta_copy_buf2img_info {
   struct {
      mali_ptr ptr;     /* GPU address of the source buffer */
      struct {
         unsigned line; /* bytes per row of buffer texels */
         unsigned surf; /* bytes per 2D slice of buffer texels */
      } stride;
   } buf;
};
953 
/* Emit a NIR load of one panvk_meta_copy_buf2img_info field from UBO 0.
 * The load width is derived from sizeof(field) and the UBO offset from
 * offsetof(), so the shader-side layout always matches the C struct.
 */
#define panvk_meta_copy_buf2img_get_info_field(b, field) \
        nir_load_ubo((b), 1, \
                     sizeof(((struct panvk_meta_copy_buf2img_info *)0)->field) * 8, \
                     nir_imm_int(b, 0), \
                     nir_imm_int(b, offsetof(struct panvk_meta_copy_buf2img_info, field)), \
                     .align_mul = 4, \
                     .align_offset = 0, \
                     .range_base = 0, \
                     .range = ~0)
963 
/*
 * Build, compile and upload the fragment shader used for buffer-to-image
 * copies.
 *
 * The shader reads the TEX0 varying as an (x, y, layer) texel coordinate,
 * computes the matching byte offset into the source buffer using the
 * pointer/strides from panvk_meta_copy_buf2img_info (UBO 0), loads the
 * texel with a global load, converts it to the render-target register
 * format, and writes it to FRAG_RESULT_DATA0.
 *
 * Returns the GPU address of the uploaded shader binary; compiler metadata
 * is returned through @shader_info.
 */
static mali_ptr
panvk_meta_copy_buf2img_shader(struct panfrost_device *pdev,
                               struct pan_pool *bin_pool,
                               struct panvk_meta_copy_format_info key,
                               struct pan_shader_info *shader_info)
{
   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
                                     GENX(pan_shader_get_compiler_options)(),
                                     "panvk_meta_copy_buf2img(imgfmt=%s,mask=%x)",
                                     util_format_name(key.imgfmt),
                                     key.mask);

   b.shader->info.internal = true;
   b.shader->info.num_ubos = 1;

   /* Interpolated (x, y, layer) coordinate provided by the blit quad. */
   nir_variable *coord_var =
      nir_variable_create(b.shader, nir_var_shader_in,
                          glsl_vector_type(GLSL_TYPE_FLOAT, 3),
                          "coord");
   coord_var->data.location = VARYING_SLOT_TEX0;
   nir_ssa_def *coord = nir_load_var(&b, coord_var);

   /* Truncate to integer texel coordinates. */
   coord = nir_f2u32(&b, coord);

   /* Buffer base pointer and strides come from the info UBO. */
   nir_ssa_def *bufptr =
      panvk_meta_copy_buf2img_get_info_field(&b, buf.ptr);
   nir_ssa_def *buflinestride =
      panvk_meta_copy_buf2img_get_info_field(&b, buf.stride.line);
   nir_ssa_def *bufsurfstride =
      panvk_meta_copy_buf2img_get_info_field(&b, buf.stride.surf);

   unsigned imgtexelsz = util_format_get_blocksize(key.imgfmt);
   unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);
   unsigned writemask = key.mask;

   /* offset = x * texelsize + y * line_stride + layer * surf_stride */
   nir_ssa_def *offset =
      nir_imul(&b, nir_channel(&b, coord, 0), nir_imm_int(&b, buftexelsz));
   offset = nir_iadd(&b, offset,
                     nir_imul(&b, nir_channel(&b, coord, 1), buflinestride));
   offset = nir_iadd(&b, offset,
                     nir_imul(&b, nir_channel(&b, coord, 2), bufsurfstride));
   bufptr = nir_iadd(&b, bufptr, nir_u2u64(&b, offset));

   /* Per-component size on the image side: formats up to 4 bytes (except
    * RGB565) are treated as arrays of bytes, larger ones as 16/32-bit
    * components.
    */
   unsigned imgcompsz =
      (imgtexelsz <= 4 && key.imgfmt != PIPE_FORMAT_R5G6B5_UNORM) ?
      1 : MIN2(1 << (ffs(imgtexelsz) - 1), 4);

   unsigned nimgcomps = imgtexelsz / imgcompsz;
   unsigned bufcompsz = MIN2(buftexelsz, imgcompsz);
   unsigned nbufcomps = buftexelsz / bufcompsz;

   assert(bufcompsz == 1 || bufcompsz == 2 || bufcompsz == 4);
   assert(nbufcomps <= 4 && nimgcomps <= 4);

   /* Load the raw texel from the source buffer. */
   nir_ssa_def *texel =
      nir_load_global(&b, bufptr, bufcompsz, nbufcomps, bufcompsz * 8);

   enum glsl_base_type basetype;
   if (key.imgfmt == PIPE_FORMAT_R5G6B5_UNORM) {
      /* Unpack the 5/6/5 bitfields and normalize each channel to [0, 1]
       * so the blendable RT path can consume float values.
       */
      texel = nir_vec3(&b,
                       nir_iand_imm(&b, texel, BITFIELD_MASK(5)),
                       nir_iand_imm(&b, nir_ushr_imm(&b, texel, 5), BITFIELD_MASK(6)),
                       nir_iand_imm(&b, nir_ushr_imm(&b, texel, 11), BITFIELD_MASK(5)));
      texel = nir_fmul(&b,
                       nir_u2f32(&b, texel),
                       nir_vec3(&b,
                                nir_imm_float(&b, 1.0f / 31),
                                nir_imm_float(&b, 1.0f / 63),
                                nir_imm_float(&b, 1.0f / 31)));
      nimgcomps = 3;
      basetype = GLSL_TYPE_FLOAT;
   } else if (imgcompsz == 1) {
      assert(bufcompsz == 1);
      /* Blendable formats are unorm and the fixed-function blend unit
       * takes float values.
       */
      texel = nir_fmul(&b, nir_u2f32(&b, texel),
                       nir_imm_float(&b, 1.0f / 255));
      basetype = GLSL_TYPE_FLOAT;
   } else {
      /* Non-blendable path: keep the raw integer bits at component size. */
      texel = nir_u2uN(&b, texel, imgcompsz * 8);
      basetype = imgcompsz == 2 ? GLSL_TYPE_UINT16 : GLSL_TYPE_UINT;
   }

   /* We always pass the texel using 32-bit regs for now */
   nir_variable *out =
      nir_variable_create(b.shader, nir_var_shader_out,
                          glsl_vector_type(basetype, nimgcomps),
                          "out");
   out->data.location = FRAG_RESULT_DATA0;

   uint16_t fullmask = (1 << nimgcomps) - 1;

   assert(fullmask >= writemask);

   if (fullmask != writemask) {
      /* Partial writemask: scatter the loaded components into their image
       * positions. Untouched components come from the preloaded RT value
       * (imgcompsz > 1) or are zero-filled (the byte path relies on the
       * RT writemask instead — NOTE(review): assumed from the zero fill;
       * confirm against panvk_meta_copy_to_img_emit_rsd()).
       */
      unsigned first_written_comp = ffs(writemask) - 1;
      nir_ssa_def *oldtexel = NULL;
      if (imgcompsz > 1)
         oldtexel = nir_load_var(&b, out);

      nir_ssa_def *texel_comps[4];
      for (unsigned i = 0; i < nimgcomps; i++) {
         if (writemask & BITFIELD_BIT(i))
            texel_comps[i] = nir_channel(&b, texel, i - first_written_comp);
         else if (imgcompsz > 1)
            texel_comps[i] = nir_channel(&b, oldtexel, i);
         else
            texel_comps[i] = nir_imm_intN_t(&b, 0, texel->bit_size);
      }

      texel = nir_vec(&b, texel_comps, nimgcomps);
   }

   nir_store_var(&b, out, texel, 0xff);

   struct panfrost_compile_inputs inputs = {
      .gpu_id = pdev->gpu_id,
      .is_blit = true,
   };

#if PAN_ARCH >= 6
   /* Bifrost+: pin the render-target conversion so the compiler emits the
    * right register file format for the integer paths.
    */
   pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) {
      cfg.memory_format = (imgcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12;
      cfg.register_format = imgcompsz == 2 ?
                            MALI_REGISTER_FILE_FORMAT_U16 :
                            MALI_REGISTER_FILE_FORMAT_U32;
   }
   inputs.bifrost.static_rt_conv = true;
#endif

   struct util_dynarray binary;

   util_dynarray_init(&binary, NULL);
   GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);

   /* Make sure UBO words have been upgraded to push constants */
   assert(shader_info->ubo_count == 1);

   mali_ptr shader =
      pan_pool_upload_aligned(bin_pool, binary.data, binary.size,
                              PAN_ARCH >= 6 ? 128 : 64);

   util_dynarray_fini(&binary);
   ralloc_free(b.shader);

   return shader;
}
1113 
1114 static unsigned
panvk_meta_copy_buf2img_format_idx(struct panvk_meta_copy_format_info key)1115 panvk_meta_copy_buf2img_format_idx(struct panvk_meta_copy_format_info key)
1116 {
1117    for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_buf2img_fmts); i++) {
1118       if (!memcmp(&key, &panvk_meta_copy_buf2img_fmts[i], sizeof(key)))
1119          return i;
1120    }
1121 
1122    unreachable("Invalid image format\n");
1123 }
1124 
/*
 * Execute one VkBufferImageCopy region as a series of meta draws: one
 * full-region quad per destination layer/slice, using the precompiled
 * buffer-to-image fragment shader. Each draw runs in its own batch
 * (close/open around the loop body).
 */
static void
panvk_meta_copy_buf2img(struct panvk_cmd_buffer *cmdbuf,
                        const struct panvk_buffer *buf,
                        const struct panvk_image *img,
                        const VkBufferImageCopy *region)
{
   struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;
   struct pan_fb_info *fbinfo = &cmdbuf->state.fb.info;
   /* Destination rectangle in image texels (inclusive max). */
   unsigned minx = MAX2(region->imageOffset.x, 0);
   unsigned miny = MAX2(region->imageOffset.y, 0);
   unsigned maxx = MAX2(region->imageOffset.x + region->imageExtent.width - 1, 0);
   unsigned maxy = MAX2(region->imageOffset.y + region->imageExtent.height - 1, 0);

   mali_ptr vpd =
      panvk_per_arch(meta_emit_viewport)(&cmdbuf->desc_pool.base,
                                         minx, miny, maxx, maxy);

   /* Quad covering the destination rectangle (x, y, z, w per vertex). */
   float dst_rect[] = {
      minx, miny, 0.0, 1.0,
      maxx + 1, miny, 0.0, 1.0,
      minx, maxy + 1, 0.0, 1.0,
      maxx + 1, maxy + 1, 0.0, 1.0,
   };
   mali_ptr dst_coords =
      pan_pool_upload_aligned(&cmdbuf->desc_pool.base, dst_rect,
                              sizeof(dst_rect), 64);

   /* Select the precompiled shader variant for this format/aspect. */
   struct panvk_meta_copy_format_info key = {
      .imgfmt = panvk_meta_copy_buf2img_format(img->pimage.layout.format),
      .mask = panvk_meta_copy_img_mask(img->pimage.layout.format,
                                       region->imageSubresource.aspectMask),
   };

   unsigned fmtidx = panvk_meta_copy_buf2img_format_idx(key);

   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.buf2img[fmtidx].rsd;
   const struct panfrost_ubo_push *pushmap =
      &cmdbuf->device->physical_device->meta.copy.buf2img[fmtidx].pushmap;

   /* Buffer addressing info passed to the shader via push constants/UBO.
    * bufferRowLength/bufferImageHeight of 0 mean "tightly packed"
    * (GNU ?: keeps the extent in that case).
    */
   unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);
   struct panvk_meta_copy_buf2img_info info = {
      .buf.ptr = buf->bo->ptr.gpu + buf->bo_offset + region->bufferOffset,
      .buf.stride.line = (region->bufferRowLength ? : region->imageExtent.width) * buftexelsz,
   };

   info.buf.stride.surf =
      (region->bufferImageHeight ? : region->imageExtent.height) * info.buf.stride.line;

   mali_ptr pushconsts =
      panvk_meta_copy_emit_push_constants(pdev, pushmap, &cmdbuf->desc_pool.base,
                                          &info, sizeof(info));
   mali_ptr ubo =
      panvk_meta_copy_emit_ubo(pdev, &cmdbuf->desc_pool.base, &info, sizeof(info));

   /* Render-target view of the destination mip level; first/last_layer are
    * patched per iteration of the layer loop below.
    */
   struct pan_image_view view = {
      .format = key.imgfmt,
      .dim = MALI_TEXTURE_DIMENSION_2D,
      .image = &img->pimage,
      .nr_samples = img->pimage.layout.nr_samples,
      .first_level = region->imageSubresource.mipLevel,
      .last_level = region->imageSubresource.mipLevel,
      .swizzle = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
   };

   /* TODO: don't force preloads of dst resources if unneeded */
   cmdbuf->state.fb.crc_valid[0] = false;
   *fbinfo = (struct pan_fb_info){
      .width = u_minify(img->pimage.layout.width, region->imageSubresource.mipLevel),
      .height = u_minify(img->pimage.layout.height, region->imageSubresource.mipLevel),
      .extent.minx = minx,
      .extent.maxx = maxx,
      .extent.miny = miny,
      .extent.maxy = maxy,
      .nr_samples = 1,
      .rt_count = 1,
      .rts[0].view = &view,
      .rts[0].preload = true,
      .rts[0].crc_valid = &cmdbuf->state.fb.crc_valid[0],
   };

   /* End the current batch: the copy draws use their own FB state. */
   panvk_per_arch(cmd_close_batch)(cmdbuf);

   /* Either an array copy (layerCount > 1) or a 3D copy (depth > 1) —
    * never both, which is why MAX2() is used for the layer math below.
    * NOTE(review): first_layer takes the max of baseArrayLayer and
    * imageOffset.z rather than their sum; this relies on at most one of
    * them being non-zero — confirm against the Vulkan spec for 3D images.
    */
   assert(region->imageSubresource.layerCount == 1 ||
          region->imageExtent.depth == 1);
   assert(region->imageOffset.z >= 0);
   unsigned first_layer = MAX2(region->imageSubresource.baseArrayLayer, region->imageOffset.z);
   unsigned nlayers = MAX2(region->imageSubresource.layerCount, region->imageExtent.depth);
   for (unsigned l = 0; l < nlayers; l++) {
      /* Source coords: (x, y, layer) varying consumed by the shader. */
      float src_rect[] = {
         0, 0, l, 1.0,
         region->imageExtent.width, 0, l, 1.0,
         0, region->imageExtent.height, l, 1.0,
         region->imageExtent.width, region->imageExtent.height, l, 1.0,
      };

      mali_ptr src_coords =
         pan_pool_upload_aligned(&cmdbuf->desc_pool.base, src_rect,
                                 sizeof(src_rect), 64);

      struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

      /* Retarget the RT view at this layer and record the BOs the batch
       * depends on.
       */
      view.first_layer = view.last_layer = l + first_layer;
      batch->blit.src = buf->bo;
      batch->blit.dst = img->pimage.data.bo;
      panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, true);
      panvk_per_arch(cmd_alloc_fb_desc)(cmdbuf);
      panvk_per_arch(cmd_prepare_tiler_context)(cmdbuf);

      mali_ptr tsd, tiler;

#if PAN_ARCH >= 6
      tsd = batch->tls.gpu;
      tiler = batch->tiler.descs.gpu;
#else
      /* Midgard: thread storage lives in the FB descriptor, no tiler ptr. */
      tsd = batch->fb.desc.gpu;
      tiler = 0;
#endif

      struct panfrost_ptr job;

      job = panvk_meta_copy_emit_tiler_job(&cmdbuf->desc_pool.base,
                                           &batch->scoreboard,
                                           src_coords, dst_coords,
                                           0, 0, ubo, pushconsts,
                                           vpd, rsd, tsd, tiler);

      util_dynarray_append(&batch->jobs, void *, job.cpu);
      panvk_per_arch(cmd_close_batch)(cmdbuf);
   }
}
1256 
1257 static void
panvk_meta_copy_buf2img_init(struct panvk_physical_device * dev)1258 panvk_meta_copy_buf2img_init(struct panvk_physical_device *dev)
1259 {
1260    STATIC_ASSERT(ARRAY_SIZE(panvk_meta_copy_buf2img_fmts) == PANVK_META_COPY_BUF2IMG_NUM_FORMATS);
1261 
1262    for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_buf2img_fmts); i++) {
1263       struct pan_shader_info shader_info;
1264       mali_ptr shader =
1265          panvk_meta_copy_buf2img_shader(&dev->pdev, &dev->meta.bin_pool.base,
1266                                         panvk_meta_copy_buf2img_fmts[i],
1267                                         &shader_info);
1268       dev->meta.copy.buf2img[i].pushmap = shader_info.push;
1269       dev->meta.copy.buf2img[i].rsd =
1270          panvk_meta_copy_to_img_emit_rsd(&dev->pdev, &dev->meta.desc_pool.base,
1271                                          shader, &shader_info,
1272                                          panvk_meta_copy_buf2img_fmts[i].imgfmt,
1273                                          panvk_meta_copy_buf2img_fmts[i].mask,
1274                                          false);
1275    }
1276 }
1277 
1278 void
panvk_per_arch(CmdCopyBufferToImage)1279 panvk_per_arch(CmdCopyBufferToImage)(VkCommandBuffer commandBuffer,
1280                                      VkBuffer srcBuffer,
1281                                      VkImage destImage,
1282                                      VkImageLayout destImageLayout,
1283                                      uint32_t regionCount,
1284                                      const VkBufferImageCopy *pRegions)
1285 {
1286    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1287    VK_FROM_HANDLE(panvk_buffer, buf, srcBuffer);
1288    VK_FROM_HANDLE(panvk_image, img, destImage);
1289 
1290    for (unsigned i = 0; i < regionCount; i++) {
1291       panvk_meta_copy_buf2img(cmdbuf, buf, img, &pRegions[i]);
1292    }
1293 }
1294 
/* Image-to-buffer copy variants. Mirrors panvk_meta_copy_buf2img_fmts but
 * uses UINT formats, since the copy shader fetches raw texels (txf) rather
 * than rendering through the blend unit.
 */
static const struct panvk_meta_copy_format_info panvk_meta_copy_img2buf_fmts[] = {
   { PIPE_FORMAT_R8_UINT, 0x1 },
   { PIPE_FORMAT_R8G8_UINT, 0x3 },
   { PIPE_FORMAT_R5G6B5_UNORM, 0x7 },
   { PIPE_FORMAT_R8G8B8A8_UINT, 0xf },
   { PIPE_FORMAT_R16G16B16_UINT, 0x7 },
   { PIPE_FORMAT_R32G32_UINT, 0x3 },
   { PIPE_FORMAT_R32G32B32_UINT, 0x7 },
   { PIPE_FORMAT_R32G32B32A32_UINT, 0xf },
   /* S8 -> Z24S8 */
   { PIPE_FORMAT_R8G8B8A8_UINT, 0x8 },
   /* S8 -> Z32_S8X24 */
   { PIPE_FORMAT_R32G32_UINT, 0x2 },
   /* Z24X8 -> Z24S8 */
   { PIPE_FORMAT_R8G8B8A8_UINT, 0x7 },
   /* Z32 -> Z32_S8X24 */
   { PIPE_FORMAT_R32G32_UINT, 0x1 },
};
1313 
static enum pipe_format
panvk_meta_copy_img2buf_format(enum pipe_format imgfmt)
{
   /* Map the image format to a UINT format of the same texel size so the
    * texels can be fetched and written to the buffer without conversion.
    * (Unlike the buf2img path, no blendable format is needed here.)
    */
   switch (util_format_get_blocksize(imgfmt)) {
   case 1: return PIPE_FORMAT_R8_UINT;
   /* AFBC stores things differently for RGB565,
    * we can't simply map to R8G8 in that case */
   case 2: return (imgfmt == PIPE_FORMAT_R5G6B5_UNORM ||
                   imgfmt == PIPE_FORMAT_B5G6R5_UNORM) ?
                  PIPE_FORMAT_R5G6B5_UNORM : PIPE_FORMAT_R8G8_UINT;
   case 4: return PIPE_FORMAT_R8G8B8A8_UINT;
   case 6: return PIPE_FORMAT_R16G16B16_UINT;
   case 8: return PIPE_FORMAT_R32G32_UINT;
   case 12: return PIPE_FORMAT_R32G32B32_UINT;
   case 16: return PIPE_FORMAT_R32G32B32A32_UINT;
   default: unreachable("Invalid format\n");
   }
}
1335 
/* UBO payload consumed by the image-to-buffer copy shader: destination
 * buffer addressing plus the source-image region used for coordinate
 * offsetting and bounds checks.
 */
struct panvk_meta_copy_img2buf_info {
   struct {
      mali_ptr ptr;     /* GPU address of the destination buffer */
      struct {
         unsigned line; /* bytes per row of buffer texels */
         unsigned surf; /* bytes per 2D slice of buffer texels */
      } stride;
   } buf;
   struct {
      struct {
         unsigned x, y, z; /* copy-region offset within the image */
      } offset;
      struct {
         unsigned minx, miny, maxx, maxy; /* inclusive texel bounds */
      } extent;
   } img;
};
1353 
/* Emit a NIR load of one panvk_meta_copy_img2buf_info field from UBO 0.
 * Same scheme as panvk_meta_copy_buf2img_get_info_field(): load width from
 * sizeof(field), UBO offset from offsetof().
 */
#define panvk_meta_copy_img2buf_get_info_field(b, field) \
        nir_load_ubo((b), 1, \
                     sizeof(((struct panvk_meta_copy_img2buf_info *)0)->field) * 8, \
                     nir_imm_int(b, 0), \
                     nir_imm_int(b, offsetof(struct panvk_meta_copy_img2buf_info, field)), \
                     .align_mul = 4, \
                     .align_offset = 0, \
                     .range_base = 0, \
                     .range = ~0)
1363 
1364 static mali_ptr
panvk_meta_copy_img2buf_shader(struct panfrost_device * pdev,struct pan_pool * bin_pool,struct panvk_meta_copy_format_info key,unsigned texdim,unsigned texisarray,struct pan_shader_info * shader_info)1365 panvk_meta_copy_img2buf_shader(struct panfrost_device *pdev,
1366                                struct pan_pool *bin_pool,
1367                                struct panvk_meta_copy_format_info key,
1368                                unsigned texdim, unsigned texisarray,
1369                                struct pan_shader_info *shader_info)
1370 {
1371    unsigned imgtexelsz = util_format_get_blocksize(key.imgfmt);
1372    unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);
1373 
1374    /* FIXME: Won't work on compute queues, but we can't do that with
1375     * a compute shader if the destination is an AFBC surface.
1376     */
1377    nir_builder b =
1378       nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
1379                                      GENX(pan_shader_get_compiler_options)(),
1380                                      "panvk_meta_copy_img2buf(dim=%dD%s,imgfmt=%s,mask=%x)",
1381                                      texdim, texisarray ? "[]" : "",
1382                                      util_format_name(key.imgfmt),
1383                                      key.mask);
1384 
1385    b.shader->info.internal = true;
1386    b.shader->info.num_ubos = 1;
1387 
1388    nir_ssa_def *coord = nir_load_global_invocation_id(&b, 32);
1389    nir_ssa_def *bufptr =
1390       panvk_meta_copy_img2buf_get_info_field(&b, buf.ptr);
1391    nir_ssa_def *buflinestride =
1392       panvk_meta_copy_img2buf_get_info_field(&b, buf.stride.line);
1393    nir_ssa_def *bufsurfstride =
1394       panvk_meta_copy_img2buf_get_info_field(&b, buf.stride.surf);
1395 
1396    nir_ssa_def *imgminx =
1397       panvk_meta_copy_img2buf_get_info_field(&b, img.extent.minx);
1398    nir_ssa_def *imgminy =
1399       panvk_meta_copy_img2buf_get_info_field(&b, img.extent.miny);
1400    nir_ssa_def *imgmaxx =
1401       panvk_meta_copy_img2buf_get_info_field(&b, img.extent.maxx);
1402    nir_ssa_def *imgmaxy =
1403       panvk_meta_copy_img2buf_get_info_field(&b, img.extent.maxy);
1404 
1405    nir_ssa_def *imgcoords, *inbounds;
1406 
1407    switch (texdim + texisarray) {
1408    case 1:
1409       imgcoords =
1410          nir_iadd(&b,
1411                   nir_channel(&b, coord, 0),
1412                   panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x));
1413       inbounds =
1414          nir_iand(&b,
1415                   nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
1416                   nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx));
1417       break;
1418    case 2:
1419       imgcoords =
1420          nir_vec2(&b,
1421                   nir_iadd(&b,
1422                            nir_channel(&b, coord, 0),
1423                            panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x)),
1424                   nir_iadd(&b,
1425                            nir_channel(&b, coord, 1),
1426                            panvk_meta_copy_img2buf_get_info_field(&b, img.offset.y)));
1427       inbounds =
1428          nir_iand(&b,
1429                   nir_iand(&b,
1430                            nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
1431                            nir_uge(&b, imgmaxy, nir_channel(&b, imgcoords, 1))),
1432                   nir_iand(&b,
1433                            nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx),
1434                            nir_uge(&b, nir_channel(&b, imgcoords, 1), imgminy)));
1435       break;
1436    case 3:
1437       imgcoords =
1438          nir_vec3(&b,
1439                   nir_iadd(&b,
1440                            nir_channel(&b, coord, 0),
1441                            panvk_meta_copy_img2buf_get_info_field(&b, img.offset.x)),
1442                   nir_iadd(&b,
1443                            nir_channel(&b, coord, 1),
1444                            panvk_meta_copy_img2buf_get_info_field(&b, img.offset.y)),
1445                   nir_iadd(&b,
1446                            nir_channel(&b, coord, 2),
1447                            panvk_meta_copy_img2buf_get_info_field(&b, img.offset.y)));
1448       inbounds =
1449          nir_iand(&b,
1450                   nir_iand(&b,
1451                            nir_uge(&b, imgmaxx, nir_channel(&b, imgcoords, 0)),
1452                            nir_uge(&b, imgmaxy, nir_channel(&b, imgcoords, 1))),
1453                   nir_iand(&b,
1454                            nir_uge(&b, nir_channel(&b, imgcoords, 0), imgminx),
1455                            nir_uge(&b, nir_channel(&b, imgcoords, 1), imgminy)));
1456       break;
1457    default:
1458       unreachable("Invalid texture dimension\n");
1459    }
1460 
1461    nir_push_if(&b, inbounds);
1462 
1463    /* FIXME: doesn't work for tiled+compressed formats since blocks are 4x4
1464     * blocks instead of 16x16 texels in that case, and there's nothing we can
1465     * do to force the tile size to 4x4 in the render path.
1466     * This being said, compressed textures are not compatible with AFBC, so we
1467     * could use a compute shader arranging the blocks properly.
1468     */
1469    nir_ssa_def *offset =
1470       nir_imul(&b, nir_channel(&b, coord, 0), nir_imm_int(&b, buftexelsz));
1471    offset = nir_iadd(&b, offset,
1472                      nir_imul(&b, nir_channel(&b, coord, 1), buflinestride));
1473    offset = nir_iadd(&b, offset,
1474                      nir_imul(&b, nir_channel(&b, coord, 2), bufsurfstride));
1475    bufptr = nir_iadd(&b, bufptr, nir_u2u64(&b, offset));
1476 
1477    unsigned imgcompsz = imgtexelsz <= 4 ?
1478                         1 : MIN2(1 << (ffs(imgtexelsz) - 1), 4);
1479    unsigned nimgcomps = imgtexelsz / imgcompsz;
1480    assert(nimgcomps <= 4);
1481 
1482    nir_tex_instr *tex = nir_tex_instr_create(b.shader, 1);
1483    tex->op = nir_texop_txf;
1484    tex->texture_index = 0;
1485    tex->is_array = texisarray;
1486    tex->dest_type = util_format_is_unorm(key.imgfmt) ?
1487                     nir_type_float32 : nir_type_uint32;
1488 
1489    switch (texdim) {
1490    case 1: tex->sampler_dim = GLSL_SAMPLER_DIM_1D; break;
1491    case 2: tex->sampler_dim = GLSL_SAMPLER_DIM_2D; break;
1492    case 3: tex->sampler_dim = GLSL_SAMPLER_DIM_3D; break;
1493    default: unreachable("Invalid texture dimension");
1494    }
1495 
1496    tex->src[0].src_type = nir_tex_src_coord;
1497    tex->src[0].src = nir_src_for_ssa(imgcoords);
1498    tex->coord_components = texdim + texisarray;
1499    nir_ssa_dest_init(&tex->instr, &tex->dest, 4,
1500                      nir_alu_type_get_type_size(tex->dest_type), NULL);
1501    nir_builder_instr_insert(&b, &tex->instr);
1502 
1503    nir_ssa_def *texel = &tex->dest.ssa;
1504 
1505    unsigned fullmask = (1 << util_format_get_nr_components(key.imgfmt)) - 1;
1506    unsigned nbufcomps = util_bitcount(fullmask);
1507    if (key.mask != fullmask) {
1508       nir_ssa_def *bufcomps[4];
1509       nbufcomps = 0;
1510       for (unsigned i = 0; i < nimgcomps; i++) {
1511          if (key.mask & BITFIELD_BIT(i))
1512             bufcomps[nbufcomps++] = nir_channel(&b, texel, i);
1513       }
1514 
1515       texel = nir_vec(&b, bufcomps, nbufcomps);
1516    }
1517 
1518    unsigned bufcompsz = buftexelsz / nbufcomps;
1519 
1520    if (key.imgfmt == PIPE_FORMAT_R5G6B5_UNORM) {
1521       texel = nir_fmul(&b, texel,
1522                        nir_vec3(&b,
1523                                 nir_imm_float(&b, 31),
1524                                 nir_imm_float(&b, 63),
1525                                 nir_imm_float(&b, 31)));
1526       texel = nir_f2u16(&b, texel);
1527       texel = nir_ior(&b, nir_channel(&b, texel, 0),
1528                       nir_ior(&b,
1529                               nir_ishl(&b, nir_channel(&b, texel, 1), nir_imm_int(&b, 5)),
1530                               nir_ishl(&b, nir_channel(&b, texel, 2), nir_imm_int(&b, 11))));
1531       imgcompsz = 2;
1532       bufcompsz = 2;
1533       nbufcomps = 1;
1534       nimgcomps = 1;
1535    } else if (imgcompsz == 1) {
1536       nir_ssa_def *packed = nir_channel(&b, texel, 0);
1537       for (unsigned i = 1; i < nbufcomps; i++) {
1538          packed = nir_ior(&b, packed,
1539                           nir_ishl(&b, nir_iand_imm(&b, nir_channel(&b, texel, i), 0xff),
1540                                    nir_imm_int(&b, i * 8)));
1541       }
1542       texel = packed;
1543 
1544       bufcompsz = nbufcomps == 3 ? 4 : nbufcomps;
1545       nbufcomps = 1;
1546    }
1547 
1548    assert(bufcompsz == 1 || bufcompsz == 2 || bufcompsz == 4);
1549    assert(nbufcomps <= 4 && nimgcomps <= 4);
1550    texel = nir_u2uN(&b, texel, bufcompsz * 8);
1551 
1552    nir_store_global(&b, bufptr, bufcompsz, texel, (1 << nbufcomps) - 1);
1553    nir_pop_if(&b, NULL);
1554 
1555    struct panfrost_compile_inputs inputs = {
1556       .gpu_id = pdev->gpu_id,
1557       .is_blit = true,
1558    };
1559 
1560    struct util_dynarray binary;
1561 
1562    util_dynarray_init(&binary, NULL);
1563    GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
1564 
1565    /* Make sure UBO words have been upgraded to push constants and everything
1566     * is at the right place.
1567     */
1568    assert(shader_info->ubo_count == 1);
1569    assert(shader_info->push.count <= (sizeof(struct panvk_meta_copy_img2buf_info) / 4));
1570 
1571    mali_ptr shader =
1572       pan_pool_upload_aligned(bin_pool, binary.data, binary.size,
1573                               PAN_ARCH >= 6 ? 128 : 64);
1574 
1575    util_dynarray_fini(&binary);
1576    ralloc_free(b.shader);
1577 
1578    return shader;
1579 }
1580 
1581 static unsigned
panvk_meta_copy_img2buf_format_idx(struct panvk_meta_copy_format_info key)1582 panvk_meta_copy_img2buf_format_idx(struct panvk_meta_copy_format_info key)
1583 {
1584    for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_img2buf_fmts); i++) {
1585       if (!memcmp(&key, &panvk_meta_copy_img2buf_fmts[i], sizeof(key)))
1586          return i;
1587    }
1588 
1589    unreachable("Invalid texel size\n");
1590 }
1591 
/*
 * Record one VkBufferImageCopy region as an image->buffer compute job.
 * The shader variant is looked up in the per-device cache by
 * (format, aspect mask) key and texture dimension/arrayness.
 */
static void
panvk_meta_copy_img2buf(struct panvk_cmd_buffer *cmdbuf,
                        const struct panvk_buffer *buf,
                        const struct panvk_image *img,
                        const VkBufferImageCopy *region)
{
   struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;
   struct panvk_meta_copy_format_info key = {
      .imgfmt = panvk_meta_copy_img2buf_format(img->pimage.layout.format),
      .mask = panvk_meta_copy_img_mask(img->pimage.layout.format,
                                       region->imageSubresource.aspectMask),
   };
   unsigned buftexelsz = panvk_meta_copy_buf_texelsize(key.imgfmt, key.mask);
   unsigned texdimidx =
      panvk_meta_copy_tex_type(img->pimage.layout.dim,
                               img->pimage.layout.array_size > 1);
   unsigned fmtidx = panvk_meta_copy_img2buf_format_idx(key);

   /* Pre-baked renderer state and push-constant map for this variant. */
   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.img2buf[texdimidx][fmtidx].rsd;
   const struct panfrost_ubo_push *pushmap =
      &cmdbuf->device->physical_device->meta.copy.img2buf[texdimidx][fmtidx].pushmap;

   /* Image offsets are rounded down to a multiple of 16 so the dispatch
    * starts on a workgroup boundary; the shader clamps against
    * img.extent for texels outside the requested window.
    */
   struct panvk_meta_copy_img2buf_info info = {
      .buf.ptr = buf->bo->ptr.gpu + buf->bo_offset + region->bufferOffset,
      .buf.stride.line = (region->bufferRowLength ? : region->imageExtent.width) * buftexelsz,
      .img.offset.x = MAX2(region->imageOffset.x & ~15, 0),
      .img.extent.minx = MAX2(region->imageOffset.x, 0),
      .img.extent.maxx = MAX2(region->imageOffset.x + region->imageExtent.width - 1, 0),
   };

   if (img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_1D) {
      /* For 1D images the second dispatch dimension walks the layers. */
      info.img.extent.maxy = region->imageSubresource.layerCount - 1;
   } else {
      info.img.offset.y = MAX2(region->imageOffset.y & ~15, 0);
      info.img.offset.z = MAX2(region->imageOffset.z, 0);
      info.img.extent.miny = MAX2(region->imageOffset.y, 0);
      info.img.extent.maxy = MAX2(region->imageOffset.y + region->imageExtent.height - 1, 0);
   }

   info.buf.stride.surf = (region->bufferImageHeight ? : region->imageExtent.height) *
                          info.buf.stride.line;

   mali_ptr pushconsts =
      panvk_meta_copy_emit_push_constants(pdev, pushmap, &cmdbuf->desc_pool.base,
                                          &info, sizeof(info));
   mali_ptr ubo =
      panvk_meta_copy_emit_ubo(pdev, &cmdbuf->desc_pool.base, &info, sizeof(info));

   /* The source is read through a texture view; cube maps are accessed
    * as 2D (layered) textures.
    */
   struct pan_image_view view = {
      .format = key.imgfmt,
      .dim = img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_CUBE ?
             MALI_TEXTURE_DIMENSION_2D : img->pimage.layout.dim,
      .image = &img->pimage,
      .nr_samples = img->pimage.layout.nr_samples,
      .first_level = region->imageSubresource.mipLevel,
      .last_level = region->imageSubresource.mipLevel,
      .first_layer = region->imageSubresource.baseArrayLayer,
      .last_layer = region->imageSubresource.baseArrayLayer + region->imageSubresource.layerCount - 1,
      .swizzle = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
   };

   mali_ptr texture =
      panvk_meta_copy_img_emit_texture(pdev, &cmdbuf->desc_pool.base, &view);
   mali_ptr sampler =
      panvk_meta_copy_img_emit_sampler(pdev, &cmdbuf->desc_pool.base);

   /* The copy runs in a batch of its own. */
   panvk_per_arch(cmd_close_batch)(cmdbuf);

   struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

   struct pan_tls_info tlsinfo = { 0 };

   batch->blit.src = img->pimage.data.bo;
   batch->blit.dst = buf->bo;
   batch->tls =
      pan_pool_alloc_desc(&cmdbuf->desc_pool.base, LOCAL_STORAGE);
   GENX(pan_emit_tls)(&tlsinfo, batch->tls.cpu);

   mali_ptr tsd = batch->tls.gpu;

   /* 16x16 workgroups for 2D/3D images, 16x1 for 1D. */
   struct pan_compute_dim wg_sz = {
      16,
      img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_1D ? 1 : 16,
      1,
   };

   /* Enough workgroups to cover the 16-aligned copy window; the third
    * dimension covers layers or depth slices.
    */
   struct pan_compute_dim num_wg = {
     (ALIGN_POT(info.img.extent.maxx + 1, 16) - info.img.offset.x) / 16,
     img->pimage.layout.dim == MALI_TEXTURE_DIMENSION_1D ?
        region->imageSubresource.layerCount :
        (ALIGN_POT(info.img.extent.maxy + 1, 16) - info.img.offset.y) / 16,
     img->pimage.layout.dim != MALI_TEXTURE_DIMENSION_1D ?
        MAX2(region->imageSubresource.layerCount, region->imageExtent.depth) : 1,
   };

   struct panfrost_ptr job =
      panvk_meta_copy_emit_compute_job(&cmdbuf->desc_pool.base,
                                       &batch->scoreboard, &num_wg, &wg_sz,
                                       texture, sampler,
                                       ubo, pushconsts,
                                       rsd, tsd);

   util_dynarray_append(&batch->jobs, void *, job.cpu);

   panvk_per_arch(cmd_close_batch)(cmdbuf);
}
1699 
1700 static void
panvk_meta_copy_img2buf_init(struct panvk_physical_device * dev)1701 panvk_meta_copy_img2buf_init(struct panvk_physical_device *dev)
1702 {
1703    STATIC_ASSERT(ARRAY_SIZE(panvk_meta_copy_img2buf_fmts) == PANVK_META_COPY_IMG2BUF_NUM_FORMATS);
1704 
1705    for (unsigned i = 0; i < ARRAY_SIZE(panvk_meta_copy_img2buf_fmts); i++) {
1706       for (unsigned texdim = 1; texdim <= 3; texdim++) {
1707          unsigned texdimidx = panvk_meta_copy_tex_type(texdim, false);
1708          assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2buf));
1709 
1710          struct pan_shader_info shader_info;
1711          mali_ptr shader =
1712             panvk_meta_copy_img2buf_shader(&dev->pdev, &dev->meta.bin_pool.base,
1713                                            panvk_meta_copy_img2buf_fmts[i],
1714                                            texdim, false, &shader_info);
1715          dev->meta.copy.img2buf[texdimidx][i].pushmap = shader_info.push;
1716          dev->meta.copy.img2buf[texdimidx][i].rsd =
1717             panvk_meta_copy_to_buf_emit_rsd(&dev->pdev,
1718                                             &dev->meta.desc_pool.base,
1719                                             shader, &shader_info, true);
1720 
1721          if (texdim == 3)
1722             continue;
1723 
1724          memset(&shader_info, 0, sizeof(shader_info));
1725          texdimidx = panvk_meta_copy_tex_type(texdim, true);
1726          assert(texdimidx < ARRAY_SIZE(dev->meta.copy.img2buf));
1727          shader =
1728             panvk_meta_copy_img2buf_shader(&dev->pdev, &dev->meta.bin_pool.base,
1729                                            panvk_meta_copy_img2buf_fmts[i],
1730                                            texdim, true, &shader_info);
1731          dev->meta.copy.img2buf[texdimidx][i].pushmap = shader_info.push;
1732          dev->meta.copy.img2buf[texdimidx][i].rsd =
1733             panvk_meta_copy_to_buf_emit_rsd(&dev->pdev,
1734                                             &dev->meta.desc_pool.base,
1735                                             shader, &shader_info, true);
1736       }
1737    }
1738 }
1739 
1740 void
panvk_per_arch(CmdCopyImageToBuffer)1741 panvk_per_arch(CmdCopyImageToBuffer)(VkCommandBuffer commandBuffer,
1742                                      VkImage srcImage,
1743                                      VkImageLayout srcImageLayout,
1744                                      VkBuffer destBuffer,
1745                                      uint32_t regionCount,
1746                                      const VkBufferImageCopy *pRegions)
1747 {
1748    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1749    VK_FROM_HANDLE(panvk_buffer, buf, destBuffer);
1750    VK_FROM_HANDLE(panvk_image, img, srcImage);
1751 
1752    for (unsigned i = 0; i < regionCount; i++) {
1753       panvk_meta_copy_img2buf(cmdbuf, buf, img, &pRegions[i]);
1754    }
1755 }
1756 
/* UBO/push-constant layout shared by the CPU and the buf2buf copy
 * shader.
 */
struct panvk_meta_copy_buf2buf_info {
   mali_ptr src; /* GPU address of the first source byte */
   mali_ptr dst; /* GPU address of the first destination byte */
};
1761 
/* Load one field of struct panvk_meta_copy_buf2buf_info from UBO 0.
 * The C struct offset doubles as the UBO byte offset, so the shader and
 * CPU sides stay in sync by construction.
 */
#define panvk_meta_copy_buf2buf_get_info_field(b, field) \
        nir_load_ubo((b), 1, \
                     sizeof(((struct panvk_meta_copy_buf2buf_info *)0)->field) * 8, \
                     nir_imm_int(b, 0), \
                     nir_imm_int(b, offsetof(struct panvk_meta_copy_buf2buf_info, field)), \
                     .align_mul = 4, \
                     .align_offset = 0, \
                     .range_base = 0, \
                     .range = ~0)
1771 
/*
 * Build a compute shader that copies one @blksz-byte block per
 * invocation from the source pointer to the destination pointer, both
 * taken from the info UBO. Returns the GPU address of the uploaded
 * binary; compile results are returned through @shader_info.
 */
static mali_ptr
panvk_meta_copy_buf2buf_shader(struct panfrost_device *pdev,
                               struct pan_pool *bin_pool,
                               unsigned blksz,
                               struct pan_shader_info *shader_info)
{
   /* FIXME: Won't work on compute queues, but we can't do that with
    * a compute shader if the destination is an AFBC surface.
    */
   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                     GENX(pan_shader_get_compiler_options)(),
                                     "panvk_meta_copy_buf2buf(blksz=%d)",
                                     blksz);

   b.shader->info.internal = true;
   /* Single UBO: struct panvk_meta_copy_buf2buf_info. */
   b.shader->info.num_ubos = 1;

   /* One invocation per block: the x component indexes the block. */
   nir_ssa_def *coord = nir_load_global_invocation_id(&b, 32);

   nir_ssa_def *offset =
      nir_u2u64(&b, nir_imul(&b, nir_channel(&b, coord, 0), nir_imm_int(&b, blksz)));
   nir_ssa_def *srcptr =
      nir_iadd(&b, panvk_meta_copy_buf2buf_get_info_field(&b, src), offset);
   nir_ssa_def *dstptr =
      nir_iadd(&b, panvk_meta_copy_buf2buf_get_info_field(&b, dst), offset);

   /* Split the block into up-to-32-bit components for the load/store. */
   unsigned compsz = blksz < 4 ? blksz : 4;
   unsigned ncomps = blksz / compsz;
   nir_store_global(&b, dstptr, blksz,
                    nir_load_global(&b, srcptr, blksz, ncomps, compsz * 8),
                    (1 << ncomps) - 1);

   struct panfrost_compile_inputs inputs = {
      .gpu_id = pdev->gpu_id,
      .is_blit = true,
   };

   struct util_dynarray binary;

   util_dynarray_init(&binary, NULL);
   GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);

   /* Make sure UBO words have been upgraded to push constants and everything
    * is at the right place.
    */
   assert(shader_info->ubo_count == 1);
   assert(shader_info->push.count == (sizeof(struct panvk_meta_copy_buf2buf_info) / 4));

   mali_ptr shader =
      pan_pool_upload_aligned(bin_pool, binary.data, binary.size,
                              PAN_ARCH >= 6 ? 128 : 64);

   util_dynarray_fini(&binary);
   ralloc_free(b.shader);

   return shader;
}
1830 
1831 static void
panvk_meta_copy_buf2buf_init(struct panvk_physical_device * dev)1832 panvk_meta_copy_buf2buf_init(struct panvk_physical_device *dev)
1833 {
1834    for (unsigned i = 0; i < ARRAY_SIZE(dev->meta.copy.buf2buf); i++) {
1835       struct pan_shader_info shader_info;
1836       mali_ptr shader =
1837          panvk_meta_copy_buf2buf_shader(&dev->pdev, &dev->meta.bin_pool.base,
1838                                         1 << i, &shader_info);
1839       dev->meta.copy.buf2buf[i].pushmap = shader_info.push;
1840       dev->meta.copy.buf2buf[i].rsd =
1841          panvk_meta_copy_to_buf_emit_rsd(&dev->pdev, &dev->meta.desc_pool.base,
1842                                          shader, &shader_info, false);
1843    }
1844 }
1845 
/*
 * Record a buffer->buffer copy region as a 1D compute job, copying one
 * power-of-two-sized block per invocation.
 */
static void
panvk_meta_copy_buf2buf(struct panvk_cmd_buffer *cmdbuf,
                        const struct panvk_buffer *src,
                        const struct panvk_buffer *dst,
                        const VkBufferCopy *region)
{
   struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;

   struct panvk_meta_copy_buf2buf_info info = {
      .src = src->bo->ptr.gpu + src->bo_offset + region->srcOffset,
      .dst = dst->bo->ptr.gpu + dst->bo_offset + region->dstOffset,
   };

   /* Pick the largest power-of-two block size (at most 16 bytes) that
    * divides both addresses and the copy size: ffs() on the low 4 bits
    * returns 0 exactly when everything is 16-byte aligned.
    */
   unsigned alignment = ffs((info.src | info.dst | region->size) & 15);
   unsigned log2blksz = alignment ? alignment - 1 : 4;

   assert(log2blksz < ARRAY_SIZE(cmdbuf->device->physical_device->meta.copy.buf2buf));
   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.buf2buf[log2blksz].rsd;
   const struct panfrost_ubo_push *pushmap =
      &cmdbuf->device->physical_device->meta.copy.buf2buf[log2blksz].pushmap;

   mali_ptr pushconsts =
      panvk_meta_copy_emit_push_constants(pdev, pushmap, &cmdbuf->desc_pool.base,
                                          &info, sizeof(info));
   mali_ptr ubo =
      panvk_meta_copy_emit_ubo(pdev, &cmdbuf->desc_pool.base, &info, sizeof(info));

   /* The copy runs in a batch of its own. */
   panvk_per_arch(cmd_close_batch)(cmdbuf);

   struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

   panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, false);

   mali_ptr tsd = batch->tls.gpu;

   /* One invocation per block; size is a multiple of the block size by
    * construction of log2blksz above.
    */
   unsigned nblocks = region->size >> log2blksz;
   struct pan_compute_dim num_wg = { nblocks, 1, 1 };
   struct pan_compute_dim wg_sz = { 1, 1, 1};
   struct panfrost_ptr job =
     panvk_meta_copy_emit_compute_job(&cmdbuf->desc_pool.base,
                                      &batch->scoreboard,
                                      &num_wg, &wg_sz,
                                      0, 0, ubo, pushconsts, rsd, tsd);

   util_dynarray_append(&batch->jobs, void *, job.cpu);

   batch->blit.src = src->bo;
   batch->blit.dst = dst->bo;
   panvk_per_arch(cmd_close_batch)(cmdbuf);
}
1897 
1898 void
panvk_per_arch(CmdCopyBuffer)1899 panvk_per_arch(CmdCopyBuffer)(VkCommandBuffer commandBuffer,
1900                               VkBuffer srcBuffer,
1901                               VkBuffer destBuffer,
1902                               uint32_t regionCount,
1903                               const VkBufferCopy *pRegions)
1904 {
1905    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1906    VK_FROM_HANDLE(panvk_buffer, src, srcBuffer);
1907    VK_FROM_HANDLE(panvk_buffer, dst, destBuffer);
1908 
1909    for (unsigned i = 0; i < regionCount; i++) {
1910       panvk_meta_copy_buf2buf(cmdbuf, src, dst, &pRegions[i]);
1911    }
1912 }
1913 
/* UBO/push-constant layout shared by the CPU and the fill-buffer
 * shader.
 */
struct panvk_meta_fill_buf_info {
   mali_ptr start; /* GPU address of the first word to fill */
   uint32_t val;   /* 32-bit fill pattern */
};
1918 
/* Load one field of struct panvk_meta_fill_buf_info from UBO 0. The C
 * struct offset doubles as the UBO byte offset, keeping the shader and
 * CPU sides in sync by construction.
 */
#define panvk_meta_fill_buf_get_info_field(b, field) \
        nir_load_ubo((b), 1, \
                     sizeof(((struct panvk_meta_fill_buf_info *)0)->field) * 8, \
                     nir_imm_int(b, 0), \
                     nir_imm_int(b, offsetof(struct panvk_meta_fill_buf_info, field)), \
                     .align_mul = 4, \
                     .align_offset = 0, \
                     .range_base = 0, \
                     .range = ~0)
1928 
/*
 * Build a compute shader that writes the 32-bit fill value to one word
 * per invocation, starting at the pointer taken from the info UBO.
 * Returns the GPU address of the uploaded binary; compile results are
 * returned through @shader_info.
 */
static mali_ptr
panvk_meta_fill_buf_shader(struct panfrost_device *pdev,
                           struct pan_pool *bin_pool,
                           struct pan_shader_info *shader_info)
{
   /* FIXME: Won't work on compute queues, but we can't do that with
    * a compute shader if the destination is an AFBC surface.
    */
   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                     GENX(pan_shader_get_compiler_options)(),
                                     "panvk_meta_fill_buf()");

   b.shader->info.internal = true;
   /* Single UBO: struct panvk_meta_fill_buf_info. */
   b.shader->info.num_ubos = 1;

   nir_ssa_def *coord = nir_load_global_invocation_id(&b, 32);

   /* Each invocation writes the word at start + 4 * invocation_id.x. */
   nir_ssa_def *offset =
      nir_u2u64(&b, nir_imul(&b, nir_channel(&b, coord, 0), nir_imm_int(&b, sizeof(uint32_t))));
   nir_ssa_def *ptr =
      nir_iadd(&b, panvk_meta_fill_buf_get_info_field(&b, start), offset);
   nir_ssa_def *val = panvk_meta_fill_buf_get_info_field(&b, val);

   nir_store_global(&b, ptr, sizeof(uint32_t), val, 1);

   struct panfrost_compile_inputs inputs = {
      .gpu_id = pdev->gpu_id,
      .is_blit = true,
   };

   struct util_dynarray binary;

   util_dynarray_init(&binary, NULL);
   GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);

   /* Make sure UBO words have been upgraded to push constants and everything
    * is at the right place: start (2 words) + val (1 word) == 3.
    */
   assert(shader_info->ubo_count == 1);
   assert(shader_info->push.count == 3);

   mali_ptr shader =
      pan_pool_upload_aligned(bin_pool, binary.data, binary.size,
                              PAN_ARCH >= 6 ? 128 : 64);

   util_dynarray_fini(&binary);
   ralloc_free(b.shader);

   return shader;
}
1980 
/*
 * Compile the fill-buffer shader and bake it into a renderer state
 * descriptor. The shader's push-constant map is returned through
 * @pushmap so record time can upload constants at the right offsets.
 * Returns the GPU address of the RSD.
 */
static mali_ptr
panvk_meta_fill_buf_emit_rsd(struct panfrost_device *pdev,
                             struct pan_pool *bin_pool,
                             struct pan_pool *desc_pool,
                             struct panfrost_ubo_push *pushmap)
{
   struct pan_shader_info shader_info;

   mali_ptr shader =
      panvk_meta_fill_buf_shader(pdev, bin_pool, &shader_info);

   struct panfrost_ptr rsd_ptr =
      pan_pool_alloc_desc_aggregate(desc_pool,
                                    PAN_DESC(RENDERER_STATE));

   pan_pack(rsd_ptr.cpu, RENDERER_STATE, cfg) {
      pan_shader_prepare_rsd(&shader_info, shader, &cfg);
   }

   *pushmap = shader_info.push;
   return rsd_ptr.gpu;
}
2003 
/* Build the single fill-buffer pipeline and cache it on the physical
 * device.
 */
static void
panvk_meta_fill_buf_init(struct panvk_physical_device *dev)
{
   dev->meta.copy.fillbuf.rsd =
      panvk_meta_fill_buf_emit_rsd(&dev->pdev, &dev->meta.bin_pool.base,
                                   &dev->meta.desc_pool.base,
                                   &dev->meta.copy.fillbuf.pushmap);
}
2012 
/*
 * Fill @size bytes of @dst starting at @offset with the 32-bit pattern
 * @val, using a compute job that writes one word per invocation.
 */
static void
panvk_meta_fill_buf(struct panvk_cmd_buffer *cmdbuf,
                    const struct panvk_buffer *dst,
                    VkDeviceSize size, VkDeviceSize offset,
                    uint32_t val)
{
   struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;

   /* VK_WHOLE_SIZE fills the rest of the buffer, rounded down to a
    * multiple of 4.
    */
   if (size == VK_WHOLE_SIZE)
      size = (dst->size - offset) & ~3ULL;

   struct panvk_meta_fill_buf_info info = {
      .start = dst->bo->ptr.gpu + dst->bo_offset + offset,
      .val = val,
   };

   /* Both offset and (explicit) size must be 4-byte aligned. */
   assert(!(offset & 3) && !(size & 3));

   unsigned nwords = size / sizeof(uint32_t);
   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.fillbuf.rsd;
   const struct panfrost_ubo_push *pushmap =
      &cmdbuf->device->physical_device->meta.copy.fillbuf.pushmap;

   mali_ptr pushconsts =
      panvk_meta_copy_emit_push_constants(pdev, pushmap, &cmdbuf->desc_pool.base,
                                          &info, sizeof(info));
   mali_ptr ubo =
      panvk_meta_copy_emit_ubo(pdev, &cmdbuf->desc_pool.base, &info, sizeof(info));

   /* The fill runs in a batch of its own. */
   panvk_per_arch(cmd_close_batch)(cmdbuf);

   struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

   panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, false);

   mali_ptr tsd = batch->tls.gpu;

   /* One invocation per 32-bit word. */
   struct pan_compute_dim num_wg = { nwords, 1, 1 };
   struct pan_compute_dim wg_sz = { 1, 1, 1};
   struct panfrost_ptr job =
     panvk_meta_copy_emit_compute_job(&cmdbuf->desc_pool.base,
                                      &batch->scoreboard,
                                      &num_wg, &wg_sz,
                                      0, 0, ubo, pushconsts, rsd, tsd);

   util_dynarray_append(&batch->jobs, void *, job.cpu);

   batch->blit.dst = dst->bo;
   panvk_per_arch(cmd_close_batch)(cmdbuf);
}
2064 
/* Vulkan entry point: fill a buffer range with a 32-bit value. */
void
panvk_per_arch(CmdFillBuffer)(VkCommandBuffer commandBuffer,
                              VkBuffer dstBuffer,
                              VkDeviceSize dstOffset,
                              VkDeviceSize fillSize,
                              uint32_t data)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   VK_FROM_HANDLE(panvk_buffer, dst, dstBuffer);

   panvk_meta_fill_buf(cmdbuf, dst, fillSize, dstOffset, data);
}
2077 
/*
 * Implement vkCmdUpdateBuffer: stage @data in the command buffer's
 * descriptor pool, then copy it to @dst with the 4-byte-block buf2buf
 * pipeline.
 */
static void
panvk_meta_update_buf(struct panvk_cmd_buffer *cmdbuf,
                      const struct panvk_buffer *dst, VkDeviceSize offset,
                      VkDeviceSize size, const void *data)
{
   struct panfrost_device *pdev = &cmdbuf->device->physical_device->pdev;

   /* The staged copy of @data lives in the descriptor pool, so it stays
    * alive as long as the command buffer does.
    */
   struct panvk_meta_copy_buf2buf_info info = {
      .src = pan_pool_upload_aligned(&cmdbuf->desc_pool.base, data, size, 4),
      .dst = dst->bo->ptr.gpu + dst->bo_offset + offset,
   };

   /* Always use the 4-byte-block variant (log2blksz == 2). */
   unsigned log2blksz = ffs(sizeof(uint32_t)) - 1;

   mali_ptr rsd =
      cmdbuf->device->physical_device->meta.copy.buf2buf[log2blksz].rsd;
   const struct panfrost_ubo_push *pushmap =
      &cmdbuf->device->physical_device->meta.copy.buf2buf[log2blksz].pushmap;

   mali_ptr pushconsts =
      panvk_meta_copy_emit_push_constants(pdev, pushmap, &cmdbuf->desc_pool.base,
                                          &info, sizeof(info));
   mali_ptr ubo =
      panvk_meta_copy_emit_ubo(pdev, &cmdbuf->desc_pool.base, &info, sizeof(info));

   /* The copy runs in a batch of its own. */
   panvk_per_arch(cmd_close_batch)(cmdbuf);

   struct panvk_batch *batch = panvk_cmd_open_batch(cmdbuf);

   panvk_per_arch(cmd_alloc_tls_desc)(cmdbuf, false);

   mali_ptr tsd = batch->tls.gpu;

   /* One invocation per 4-byte block. */
   unsigned nblocks = size >> log2blksz;
   struct pan_compute_dim num_wg = { nblocks, 1, 1 };
   struct pan_compute_dim wg_sz = { 1, 1, 1};
   struct panfrost_ptr job =
     panvk_meta_copy_emit_compute_job(&cmdbuf->desc_pool.base,
                                      &batch->scoreboard,
                                      &num_wg, &wg_sz,
                                      0, 0, ubo, pushconsts, rsd, tsd);

   util_dynarray_append(&batch->jobs, void *, job.cpu);

   batch->blit.dst = dst->bo;
   panvk_per_arch(cmd_close_batch)(cmdbuf);
}
2125 
/* Vulkan entry point: write @dataSize bytes of host data into a
 * buffer.
 */
void
panvk_per_arch(CmdUpdateBuffer)(VkCommandBuffer commandBuffer,
                                VkBuffer dstBuffer,
                                VkDeviceSize dstOffset,
                                VkDeviceSize dataSize,
                                const void *pData)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   VK_FROM_HANDLE(panvk_buffer, dst, dstBuffer);

   panvk_meta_update_buf(cmdbuf, dst, dstOffset, dataSize, pData);
}
2138 
/*
 * Precompile every meta copy/fill pipeline for this physical device so
 * command recording only has to look them up.
 */
void
panvk_per_arch(meta_copy_init)(struct panvk_physical_device *dev)
{
   panvk_meta_copy_img2img_init(dev, false);
   panvk_meta_copy_img2img_init(dev, true);
   panvk_meta_copy_buf2img_init(dev);
   panvk_meta_copy_img2buf_init(dev);
   panvk_meta_copy_buf2buf_init(dev);
   panvk_meta_fill_buf_init(dev);
}
2149