/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

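/* remap a source swizzle so it picks up components from wherever register
 * allocation actually placed them (only SSA/REG sources have allocation
 * info; other source types keep their swizzle unchanged)
 */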
static unsigned
src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
   struct ir2_reg_component *comps;
   unsigned swiz = 0;

   switch (src->type) {
   case IR2_SRC_SSA:
   case IR2_SRC_REG:
      break;
   default:
      return src->swizzle;
   }
   /* we need to take into account where the components were allocated */
   comps = get_reg_src(ctx, src)->comp;
   for (int i = 0; i < ncomp; i++) {
      swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i);
   }
   return swiz;
}

/* ALU instructions need to take into account how the output components are
 * allocated */

/* scalar doesn't need to take the dest swizzle into account */

static unsigned
alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg)
{
   /* hardware seems to take from W, but swizzle everywhere just in case */
   return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX);
}

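/* vector case: on top of the source swizzle, compensate for the positions
 * the dest components were allocated to, so each result lands in the right
 * register component
 */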
static unsigned
alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr,
            struct ir2_src *src)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr));
   unsigned swiz = 0;

   /* non-per-component special cases */
   switch (instr->alu.vector_opc) {
   case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv:
      return alu_swizzle_scalar(ctx, src);
   case DOT2ADDv:
   case DOT3v:
   case DOT4v:
   case CUBEv:
      return swiz0;
   default:
      break;
   }

   for (int i = 0, j = 0; i < dst_ncomp(instr); j++) {
      if (instr->alu.write_mask & 1 << j) {
         if (comp[j].c != 7)
            swiz |= swiz_set(i, comp[j].c);
         i++;
      }
   }
   return swiz_merge(swiz0, swiz);
}

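/* two-source scalar ops: pack both operands into the single src3 swizzle
 * (s1 selects the component of the second operand, already extracted by
 * the caller)
 */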
static unsigned
alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1)
{
   /* hardware seems to take from ZW, but swizzle everywhere (ABAB) */
   unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0);
   return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY);
}

/* write_mask needs to be transformed by allocation information */

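/* e.g. a write to .xy becomes a write to .yw if those are the components
 * the register allocator assigned
 */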
static unsigned
alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned write_mask = 0;

   for (int i = 0; i < 4; i++) {
      if (instr->alu.write_mask & 1 << i)
         write_mask |= 1 << comp[i].c;
   }

   return write_mask;
}

/* fetch instructions can swizzle dest, but src swizzle needs conversion */

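/* repack an ALU-style source swizzle into the fetch encoding: two bits of
 * component select per fetched component
 */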
static unsigned
fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
   unsigned alu_swiz = src_swizzle(ctx, src, ncomp);
   unsigned swiz = 0;
   for (int i = 0; i < ncomp; i++)
      swiz |= swiz_get(alu_swiz, i) << i * 2;
   return swiz;
}

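/* fetch dest swizzles use 3 bits per component, with 7 meaning "masked":
 * start fully masked (0xfff) and route result component i to the register
 * component it was allocated to
 */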
static unsigned
fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned dst_swiz = 0xfff;
   for (int i = 0; i < dst_ncomp(instr); i++) {
      dst_swiz &= ~(7 << comp[i].c * 3);
      dst_swiz |= i << comp[i].c * 3;
   }
   return dst_swiz;
}

/* register / export # for instr */
static unsigned
dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr)
{
   if (is_export(instr))
      return instr->alu.export;

   return get_reg(instr)->idx;
}

/* register # for src */
static unsigned
src_to_reg(struct ir2_context *ctx, struct ir2_src *src)
{
   return get_reg_src(ctx, src)->idx;
}

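/* source operand byte for the ALU instruction word: constants encode the
 * constant index directly, registers put the abs modifier in bit 7
 */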
static unsigned
src_reg_byte(struct ir2_context *ctx, struct ir2_src *src)
{
   if (src->type == IR2_SRC_CONST) {
      assert(!src->abs); /* no abs bit for const */
      return src->num;
   }
   return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0);
}

/* produce the 12 byte binary instruction for a given sched_instr */
static void
fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched, instr_t *bc,
           bool *is_fetch)
{
   struct ir2_instr *instr = sched->instr, *instr_s, *instr_v;

   *bc = (instr_t){};

   if (instr && instr->type == IR2_FETCH) {
      *is_fetch = true;

      bc->fetch.opc = instr->fetch.opc;
      bc->fetch.pred_select = !!instr->pred;
      bc->fetch.pred_condition = instr->pred & 1;

      struct ir2_src *src = instr->src;

      if (instr->fetch.opc == VTX_FETCH) {
         instr_fetch_vtx_t *vtx = &bc->fetch.vtx;

         assert(instr->fetch.vtx.const_idx <= 0x1f);
         assert(instr->fetch.vtx.const_idx_sel <= 0x3);

         vtx->src_reg = src_to_reg(ctx, src);
         vtx->src_swiz = fetch_swizzle(ctx, src, 1);
         vtx->dst_reg = dst_to_reg(ctx, instr);
         vtx->dst_swiz = fetch_dst_swiz(ctx, instr);

         vtx->must_be_one = 1;
         vtx->const_index = instr->fetch.vtx.const_idx;
         vtx->const_index_sel = instr->fetch.vtx.const_idx_sel;

         /* other fields will be patched */

         /* XXX seems like every FETCH but the first has
          * this bit set:
          */
         vtx->reserved3 = instr->idx ? 0x1 : 0x0;
         vtx->reserved0 = instr->idx ? 0x2 : 0x3;
      } else if (instr->fetch.opc == TEX_FETCH) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 3);
         tex->dst_reg = dst_to_reg(ctx, instr);
         tex->dst_swiz = fetch_dst_swiz(ctx, instr);
         /* tex->const_idx = patch_fetches */
         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT;
         tex->use_reg_lod = instr->src_count == 2;
         tex->sample_location = SAMPLE_CENTER;
         tex->tx_coord_denorm = instr->fetch.tex.is_rect;
      } else if (instr->fetch.opc == TEX_SET_TEX_LOD) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 1);
         tex->dst_reg = 0;
         tex->dst_swiz = 0xfff;

         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = 1;
         tex->use_reg_lod = 0;
         tex->sample_location = SAMPLE_CENTER;
      } else {
         assert(0);
      }
      return;
   }

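   /* ALU case: a single instruction word can hold both a vector op and a
    * co-issued scalar op
    */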
   instr_v = sched->instr;
   instr_s = sched->instr_s;

   if (instr_v) {
      struct ir2_src src1, src2, *src3;

      src1 = instr_v->src[0];
      src2 = instr_v->src[instr_v->src_count > 1];
      src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL;

      bc->alu.vector_opc = instr_v->alu.vector_opc;
      bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v);
      bc->alu.vector_dest = dst_to_reg(ctx, instr_v);
      bc->alu.vector_clamp = instr_v->alu.saturate;
      bc->alu.export_data = instr_v->alu.export >= 0;

      /* single operand SETEv, use 0.0f as src2 */
      if (instr_v->src_count == 1 &&
          (bc->alu.vector_opc == SETEv || bc->alu.vector_opc == SETNEv ||
           bc->alu.vector_opc == SETGTv || bc->alu.vector_opc == SETGTEv))
         src2 = ir2_zero(ctx);

      /* export32 instr for a20x hw binning has this bit set..
       * it seems to do more than change the base address of constants
       * XXX this is a hack
       */
      bc->alu.relative_addr =
         (bc->alu.export_data && bc->alu.vector_dest == 32);

      bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1);
      bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1);
      bc->alu.src1_reg_negate = src1.negate;
      bc->alu.src1_sel = src1.type != IR2_SRC_CONST;

      bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2);
      bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2);
      bc->alu.src2_reg_negate = src2.negate;
      bc->alu.src2_sel = src2.type != IR2_SRC_CONST;

      if (src3) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src3);
         bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3);
         bc->alu.src3_reg_negate = src3->negate;
         bc->alu.src3_sel = src3->type != IR2_SRC_CONST;
      }

      bc->alu.pred_select = instr_v->pred;
   }

   if (instr_s) {
      struct ir2_src *src = instr_s->src;

      bc->alu.scalar_opc = instr_s->alu.scalar_opc;
      bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s);
      bc->alu.scalar_dest = dst_to_reg(ctx, instr_s);
      bc->alu.scalar_clamp = instr_s->alu.saturate;
      bc->alu.export_data = instr_s->alu.export >= 0;

      if (instr_s->src_count == 1) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      } else {
         assert(instr_s->src_count == 2);

         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz =
            alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      }

      if (instr_v)
         assert(instr_s->pred == instr_v->pred);
      bc->alu.pred_select = instr_s->pred;
   }

   *is_fetch = false;
   return;
}

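/* flush pending CF state: emit the optional ALLOC plus the current EXEC,
 * then reset the exec accumulator for the next clause
 */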
static unsigned
write_cfs(struct ir2_context *ctx, instr_cf_t *cfs, unsigned cf_idx,
          instr_cf_alloc_t *alloc, instr_cf_exec_t *exec)
{
   assert(exec->count);

   if (alloc)
      cfs[cf_idx++].alloc = *alloc;

   /* record where the memory-export alloc ended up, for later patching
    * (pairs of CF instructions pack into 3 dwords)
    */
   if (alloc && alloc->buffer_select == SQ_MEMORY &&
       ctx->info->mem_export_ptr == -1)
      ctx->info->mem_export_ptr = cf_idx / 2 * 3;

   cfs[cf_idx++].exec = *exec;
   exec->address += exec->count;
   exec->serialize = 0;
   exec->count = 0;

   return cf_idx;
}

/* assemble the final shader */
void
assemble(struct ir2_context *ctx, bool binning)
{
   /* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384)
    * address is 9 bits so could it be 512?
    */
   instr_cf_t cfs[384];
   instr_t bytecode[384], bc;
   unsigned block_addr[128];
   unsigned num_cf = 0;

   /* CF instr state */
   instr_cf_exec_t exec = {.opc = EXEC};
   instr_cf_alloc_t alloc = {.opc = ALLOC};

   int sync_id, sync_id_prev = -1;
   bool is_fetch = false;
   bool need_sync = true;
   bool need_alloc = false;
   unsigned block_idx = 0;

   ctx->info->mem_export_ptr = -1;
   ctx->info->num_fetch_instrs = 0;

   /* a vertex shader always needs to allocate at least one parameter,
    * even if no parameter will ever be exported
    */
   if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) {
      alloc.buffer_select = SQ_PARAMETER_PIXEL;
      cfs[num_cf++].alloc = alloc;
   }

   block_addr[0] = 0;

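   /* walk the scheduled instructions, packing them into EXEC clauses of at
    * most 6 instructions and emitting ALLOC/JMP CF instructions as needed
    */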
   for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) {
      struct ir2_instr *instr = ctx->instr_sched[j].instr;

      /* catch IR2_CF since it isn't a regular instruction */
      if (instr && instr->type == IR2_CF) {
         assert(!need_alloc); /* XXX */

         /* flush any exec cf before inserting jmp */
         if (exec.count)
            num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec);

         cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t){
            .opc = COND_JMP,
            .address = instr->cf.block_idx, /* will be fixed later */
            .force_call = !instr->pred,
            .predicated_jmp = 1,
            .direction = instr->cf.block_idx > instr->block_idx,
            .condition = instr->pred & 1,
         };
         continue;
      }

      /* fill the 3 dwords for the instruction */
      fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch);

      /* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */
      sync_id = 0;
      if (is_fetch)
         sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2;

      need_sync = sync_id != sync_id_prev;
      sync_id_prev = sync_id;

      unsigned block;
      {
         if (ctx->instr_sched[j].instr)
            block = ctx->instr_sched[j].instr->block_idx;
         else
            block = ctx->instr_sched[j].instr_s->block_idx;

         assert(block_idx <= block);
      }

      /* info for patching */
      if (is_fetch) {
         struct ir2_fetch_info *info =
            &ctx->info->fetch_info[ctx->info->num_fetch_instrs++];
         info->offset = i * 3; /* add cf offset later */

         if (bc.fetch.opc == VTX_FETCH) {
            info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz;
         } else if (bc.fetch.opc == TEX_FETCH) {
            info->tex.samp_id = instr->fetch.tex.samp_id;
            info->tex.src_swiz = bc.fetch.tex.src_swiz;
         } else {
            ctx->info->num_fetch_instrs--;
         }
      }

      /* exec cf after 6 instr or when switching between fetch / alu */
      if (exec.count == 6 ||
          (exec.count && (need_sync || block != block_idx))) {
         num_cf =
            write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
         need_alloc = false;
      }

      /* update block_addrs for jmp patching */
      while (block_idx < block)
         block_addr[++block_idx] = num_cf;

      /* export - fill alloc cf */
      if (!is_fetch && bc.alu.export_data) {
         /* get the export buffer from either vector/scalar dest */
         instr_alloc_type_t buffer = export_buf(bc.alu.vector_dest);
         if (bc.alu.scalar_write_mask) {
            if (bc.alu.vector_write_mask)
               assert(buffer == export_buf(bc.alu.scalar_dest));
            buffer = export_buf(bc.alu.scalar_dest);
         }

         /* flush previous alloc if the buffer changes */
         bool need_new_alloc = buffer != alloc.buffer_select;

         /* memory export always in 32/33 pair, new alloc on 32 */
         if (bc.alu.vector_dest == 32)
            need_new_alloc = true;

         if (need_new_alloc && exec.count) {
            num_cf =
               write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
            need_alloc = false;
         }

         need_alloc |= need_new_alloc;

         alloc.size = 0;
         alloc.buffer_select = buffer;

         if (buffer == SQ_PARAMETER_PIXEL &&
             ctx->so->type == MESA_SHADER_VERTEX)
            alloc.size = ctx->f->inputs_count - 1;

         if (buffer == SQ_POSITION)
            alloc.size = ctx->so->writes_psize;
      }

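      /* two serialize bits per instruction in the clause: the low bit
       * marks a fetch, the high bit requests a sync before executing it
       */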
      if (is_fetch)
         exec.serialize |= 0x1 << exec.count * 2;
      if (need_sync)
         exec.serialize |= 0x2 << exec.count * 2;

      need_sync = false;
      exec.count += 1;
      bytecode[i++] = bc;
   }

   /* final exec cf */
   exec.opc = EXEC_END;
   num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);

   /* insert nop to get an even # of CFs */
   if (num_cf % 2)
      cfs[num_cf++] = (instr_cf_t){.opc = NOP};

   /* patch cf addrs: addresses are in 3-dword units, and the ALU/fetch
    * instructions start after the CF block, which takes up num_cf / 2 units
    */
   for (int idx = 0; idx < num_cf; idx++) {
      switch (cfs[idx].opc) {
      case NOP:
      case ALLOC:
         break;
      case EXEC:
      case EXEC_END:
         cfs[idx].exec.address += num_cf / 2;
         break;
      case COND_JMP:
         cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address];
         break;
      default:
         assert(0);
      }
   }

   /* concatenate cfs and alu/fetch */
   uint32_t cfdwords = num_cf / 2 * 3;
   uint32_t alufetchdwords = exec.address * 3;
   uint32_t sizedwords = cfdwords + alufetchdwords;
   uint32_t *dwords = malloc(sizedwords * 4);
   assert(dwords);
   memcpy(dwords, cfs, cfdwords * 4);
   memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4);

   /* finalize ir2_shader_info */
   ctx->info->dwords = dwords;
   ctx->info->sizedwords = sizedwords;
   for (int i = 0; i < ctx->info->num_fetch_instrs; i++)
      ctx->info->fetch_info[i].offset += cfdwords;

   if (FD_DBG(DISASM)) {
      DBG("disassemble: type=%d", ctx->so->type);
      disasm_a2xx(dwords, sizedwords, 0, ctx->so->type);
   }
}