1 #include "sfn_vertexstageexport.h"
2 
3 #include "sfn_shaderio.h"
4 
5 namespace r600 {
6 
7 using std::priority_queue;
8 
VertexStageExportBase(VertexStage & proc)9 VertexStageExportBase::VertexStageExportBase(VertexStage& proc):
10    m_proc(proc),
11    m_cur_clip_pos(1)
12 {
13 
14 }
15 
~VertexStageExportBase()16 VertexStageExportBase::~VertexStageExportBase()
17 {
18 
19 }
20 
do_process_outputs(nir_variable * output)21 bool VertexStageExportBase::do_process_outputs(nir_variable *output)
22 {
23    return true;
24 }
25 
emit_shader_start()26 void VertexStageExportBase::emit_shader_start()
27 {
28 
29 }
30 
scan_store_output(nir_intrinsic_instr * instr)31 void VertexStageExportBase::scan_store_output(nir_intrinsic_instr* instr)
32 {
33 
34 }
35 
store_output(nir_intrinsic_instr * instr)36 bool VertexStageExportBase::store_output(nir_intrinsic_instr* instr)
37 {
38    auto index = nir_src_as_const_value(instr->src[1]);
39    assert(index && "Indirect outputs not supported");
40 
41    const store_loc store_info  = {
42       nir_intrinsic_component(instr),
43       nir_intrinsic_io_semantics(instr).location,
44       (unsigned)nir_intrinsic_base(instr) + index->u32,
45       0
46    };
47 
48    return do_store_output(store_info, instr);
49 }
50 
VertexStageExportForFS(VertexStage & proc,const pipe_stream_output_info * so_info,r600_pipe_shader * pipe_shader,const r600_shader_key & key)51 VertexStageExportForFS::VertexStageExportForFS(VertexStage& proc,
52                                                const pipe_stream_output_info *so_info,
53                                                r600_pipe_shader *pipe_shader, const r600_shader_key &key):
54    VertexStageWithOutputInfo(proc),
55    m_last_param_export(nullptr),
56    m_last_pos_export(nullptr),
57    m_num_clip_dist(0),
58    m_enabled_stream_buffers_mask(0),
59    m_so_info(so_info),
60    m_pipe_shader(pipe_shader),
61    m_key(key)
62 {
63 }
64 
do_process_outputs(nir_variable * output)65 bool VertexStageWithOutputInfo::do_process_outputs(nir_variable *output)
66 {
67    if (output->data.location == VARYING_SLOT_COL0 ||
68        output->data.location == VARYING_SLOT_COL1 ||
69        (output->data.location >= VARYING_SLOT_VAR0 &&
70        output->data.location <= VARYING_SLOT_VAR31) ||
71        (output->data.location >= VARYING_SLOT_TEX0 &&
72         output->data.location <= VARYING_SLOT_TEX7) ||
73        output->data.location == VARYING_SLOT_BFC0 ||
74        output->data.location == VARYING_SLOT_BFC1 ||
75        output->data.location == VARYING_SLOT_CLIP_VERTEX ||
76        output->data.location == VARYING_SLOT_CLIP_DIST0 ||
77        output->data.location == VARYING_SLOT_CLIP_DIST1 ||
78        output->data.location == VARYING_SLOT_POS ||
79        output->data.location == VARYING_SLOT_PSIZ ||
80        output->data.location == VARYING_SLOT_FOGC ||
81        output->data.location == VARYING_SLOT_LAYER ||
82        output->data.location == VARYING_SLOT_EDGE ||
83        output->data.location == VARYING_SLOT_VIEWPORT
84        ) {
85 
86       r600_shader_io& io = m_proc.sh_info().output[output->data.driver_location];
87       auto semantic = r600_get_varying_semantic(output->data.location);
88       io.name = semantic.first;
89       io.sid = semantic.second;
90 
91       m_proc.evaluate_spi_sid(io);
92       io.write_mask = ((1 << glsl_get_components(output->type)) - 1)
93                       << output->data.location_frac;
94       ++m_proc.sh_info().noutput;
95 
96       if (output->data.location == VARYING_SLOT_PSIZ ||
97           output->data.location == VARYING_SLOT_EDGE ||
98           output->data.location == VARYING_SLOT_LAYER) // VIEWPORT?
99             m_cur_clip_pos = 2;
100 
101       if (output->data.location != VARYING_SLOT_POS &&
102           output->data.location != VARYING_SLOT_EDGE &&
103           output->data.location != VARYING_SLOT_PSIZ &&
104           output->data.location != VARYING_SLOT_CLIP_VERTEX)
105          m_param_driver_locations.push(output->data.driver_location);
106 
107       return true;
108    }
109    return false;
110 }
111 
do_store_output(const store_loc & store_info,nir_intrinsic_instr * instr)112 bool VertexStageExportForFS::do_store_output(const store_loc& store_info, nir_intrinsic_instr* instr)
113 {
114    switch (store_info.location) {
115    case VARYING_SLOT_PSIZ:
116       m_proc.sh_info().vs_out_point_size = 1;
117       m_proc.sh_info().vs_out_misc_write = 1;
118       FALLTHROUGH;
119    case VARYING_SLOT_POS:
120       return emit_varying_pos(store_info, instr);
121    case VARYING_SLOT_EDGE: {
122       std::array<uint32_t, 4> swizzle_override = {7 ,0, 7, 7};
123       return emit_varying_pos(store_info, instr, &swizzle_override);
124    }
125    case VARYING_SLOT_VIEWPORT: {
126       std::array<uint32_t, 4> swizzle_override = {7, 7, 7, 0};
127       return emit_varying_pos(store_info, instr, &swizzle_override) &&
128             emit_varying_param(store_info, instr);
129    }
130    case VARYING_SLOT_CLIP_VERTEX:
131       return emit_clip_vertices(store_info, instr);
132    case VARYING_SLOT_CLIP_DIST0:
133    case VARYING_SLOT_CLIP_DIST1:
134       m_num_clip_dist += 4;
135       return emit_varying_param(store_info, instr) && emit_varying_pos(store_info, instr);
136    case VARYING_SLOT_LAYER: {
137       m_proc.sh_info().vs_out_misc_write = 1;
138       m_proc.sh_info().vs_out_layer = 1;
139       std::array<uint32_t, 4> swz = {7,7,0,7};
140       return emit_varying_pos(store_info, instr, &swz) &&
141             emit_varying_param(store_info, instr);
142    }
143    case VARYING_SLOT_VIEW_INDEX:
144       return emit_varying_pos(store_info, instr) &&
145             emit_varying_param(store_info, instr);
146 
147    default:
148          return emit_varying_param(store_info, instr);
149    }
150 
151    fprintf(stderr, "r600-NIR: Unimplemented store_deref for %d\n",
152            store_info.location);
153    return false;
154 }
155 
emit_varying_pos(const store_loc & store_info,nir_intrinsic_instr * instr,std::array<uint32_t,4> * swizzle_override)156 bool VertexStageExportForFS::emit_varying_pos(const store_loc &store_info, nir_intrinsic_instr* instr,
157                                               std::array<uint32_t, 4> *swizzle_override)
158 {
159    std::array<uint32_t,4> swizzle;
160    uint32_t write_mask = 0;
161 
162    if (swizzle_override) {
163       swizzle = *swizzle_override;
164       for (int i = 0; i < 4; ++i) {
165          if (swizzle[i] < 6)
166             write_mask |= 1 << i;
167       }
168    } else {
169       write_mask = nir_intrinsic_write_mask(instr) << store_info.frac;
170       for (int i = 0; i < 4; ++i)
171          swizzle[i] = ((1 << i) & write_mask) ? i - store_info.frac : 7;
172    }
173 
174    m_proc.sh_info().output[store_info.driver_location].write_mask = write_mask;
175 
176    GPRVector value = m_proc.vec_from_nir_with_fetch_constant(instr->src[store_info.data_loc], write_mask, swizzle);
177    m_proc.set_output(store_info.driver_location, value.sel());
178 
179    int export_slot = 0;
180 
181    switch (store_info.location) {
182    case VARYING_SLOT_EDGE: {
183       m_proc.sh_info().vs_out_misc_write = 1;
184       m_proc.sh_info().vs_out_edgeflag = 1;
185       m_proc.emit_instruction(op1_mov, value.reg_i(1), {value.reg_i(1)}, {alu_write, alu_dst_clamp, alu_last_instr});
186       m_proc.emit_instruction(op1_flt_to_int, value.reg_i(1), {value.reg_i(1)}, {alu_write, alu_last_instr});
187       m_proc.sh_info().output[store_info.driver_location].write_mask = 0xf;
188    }
189       FALLTHROUGH;
190    case VARYING_SLOT_PSIZ:
191    case VARYING_SLOT_LAYER:
192       export_slot = 1;
193       break;
194    case VARYING_SLOT_VIEWPORT:
195       m_proc.sh_info().vs_out_misc_write = 1;
196       m_proc.sh_info().vs_out_viewport = 1;
197       export_slot = 1;
198       break;
199    case VARYING_SLOT_POS:
200       break;
201    case VARYING_SLOT_CLIP_DIST0:
202    case VARYING_SLOT_CLIP_DIST1:
203       export_slot = m_cur_clip_pos++;
204       break;
205    default:
206       sfn_log << SfnLog::err << __func__ << "Unsupported location "
207               << store_info.location << "\n";
208       return false;
209    }
210 
211    m_last_pos_export = new ExportInstruction(export_slot, value, ExportInstruction::et_pos);
212    m_proc.emit_export_instruction(m_last_pos_export);
213    m_proc.add_param_output_reg(store_info.driver_location, m_last_pos_export->gpr_ptr());
214    return true;
215 }
216 
emit_varying_param(const store_loc & store_info,nir_intrinsic_instr * instr)217 bool VertexStageExportForFS::emit_varying_param(const store_loc &store_info, nir_intrinsic_instr* instr)
218 {
219    assert(store_info.driver_location < m_proc.sh_info().noutput);
220    sfn_log << SfnLog::io << __func__ << ": emit DDL: " << store_info.driver_location << "\n";
221 
222    int write_mask = nir_intrinsic_write_mask(instr) << store_info.frac;
223    std::array<uint32_t,4> swizzle;
224    for (int i = 0; i < 4; ++i)
225       swizzle[i] = ((1 << i) & write_mask) ? i - store_info.frac : 7;
226 
227    //m_proc.sh_info().output[store_info.driver_location].write_mask = write_mask;
228 
229    GPRVector value = m_proc.vec_from_nir_with_fetch_constant(instr->src[store_info.data_loc], write_mask, swizzle, true);
230    m_proc.sh_info().output[store_info.driver_location].gpr = value.sel();
231 
232    /* This should use the registers!! */
233    m_proc.set_output(store_info.driver_location, value.sel());
234 
235    m_last_param_export = new ExportInstruction(param_id(store_info.driver_location),
236                                                value, ExportInstruction::et_param);
237    m_proc.emit_export_instruction(m_last_param_export);
238    m_proc.add_param_output_reg(store_info.driver_location, m_last_param_export->gpr_ptr());
239    return true;
240 }
241 
emit_clip_vertices(const store_loc & store_info,nir_intrinsic_instr * instr)242 bool VertexStageExportForFS::emit_clip_vertices(const store_loc &store_info, nir_intrinsic_instr* instr)
243 {
244    m_proc.sh_info().cc_dist_mask = 0xff;
245    m_proc.sh_info().clip_dist_write = 0xff;
246 
247    m_clip_vertex = m_proc.vec_from_nir_with_fetch_constant(instr->src[store_info.data_loc], 0xf, {0,1,2,3});
248    m_proc.add_param_output_reg(store_info.driver_location, &m_clip_vertex);
249 
250    for (int i = 0; i < 4; ++i)
251       m_proc.sh_info().output[store_info.driver_location].write_mask |= 1 << i;
252 
253    GPRVector clip_dist[2] = { m_proc.get_temp_vec4(), m_proc.get_temp_vec4()};
254 
255    for (int i = 0; i < 8; i++) {
256       int oreg = i >> 2;
257       int ochan = i & 3;
258       AluInstruction *ir = nullptr;
259       for (int j = 0; j < 4; j++) {
260          ir = new AluInstruction(op2_dot4_ieee, clip_dist[oreg].reg_i(j), m_clip_vertex.reg_i(j),
261                                  PValue(new UniformValue(512 + i, j, R600_BUFFER_INFO_CONST_BUFFER)),
262                                  (j == ochan) ? EmitInstruction::write : EmitInstruction::empty);
263          m_proc.emit_instruction(ir);
264       }
265       ir->set_flag(alu_last_instr);
266    }
267 
268    m_last_pos_export = new ExportInstruction(m_cur_clip_pos++, clip_dist[0], ExportInstruction::et_pos);
269    m_proc.emit_export_instruction(m_last_pos_export);
270 
271    m_last_pos_export = new ExportInstruction(m_cur_clip_pos, clip_dist[1], ExportInstruction::et_pos);
272    m_proc.emit_export_instruction(m_last_pos_export);
273 
274    return true;
275 }
276 
VertexStageWithOutputInfo(VertexStage & proc)277 VertexStageWithOutputInfo::VertexStageWithOutputInfo(VertexStage& proc):
278    VertexStageExportBase(proc),
279    m_current_param(0)
280 {
281 
282 }
283 
scan_store_output(nir_intrinsic_instr * instr)284 void VertexStageWithOutputInfo::scan_store_output(nir_intrinsic_instr* instr)
285 {
286    auto location = nir_intrinsic_io_semantics(instr).location;
287    auto driver_location = nir_intrinsic_base(instr);
288    auto index = nir_src_as_const_value(instr->src[1]);
289    assert(index);
290 
291    unsigned noutputs = driver_location + index->u32 + 1;
292    if (m_proc.sh_info().noutput < noutputs)
293       m_proc.sh_info().noutput = noutputs;
294 
295    r600_shader_io& io = m_proc.sh_info().output[driver_location + index->u32];
296    auto semantic = r600_get_varying_semantic(location + index->u32);
297    io.name = semantic.first;
298    io.sid = semantic.second;
299    m_proc.evaluate_spi_sid(io);
300    io.write_mask = nir_intrinsic_write_mask(instr);
301 
302    if (location == VARYING_SLOT_PSIZ ||
303        location == VARYING_SLOT_EDGE ||
304        location == VARYING_SLOT_LAYER) // VIEWPORT?
305       m_cur_clip_pos = 2;
306 
307    if (location != VARYING_SLOT_POS &&
308        location != VARYING_SLOT_EDGE &&
309        location != VARYING_SLOT_PSIZ &&
310        location != VARYING_SLOT_CLIP_VERTEX) {
311       m_param_driver_locations.push(driver_location + index->u32);
312    }
313 }
314 
param_id(unsigned driver_location)315 unsigned VertexStageWithOutputInfo::param_id(unsigned driver_location)
316 {
317    auto param_loc = m_param_map.find(driver_location);
318    assert(param_loc != m_param_map.end());
319    return param_loc->second;
320 }
321 
emit_shader_start()322 void VertexStageWithOutputInfo::emit_shader_start()
323 {
324    while (!m_param_driver_locations.empty()) {
325       auto loc = m_param_driver_locations.top();
326       m_param_driver_locations.pop();
327       m_param_map[loc] = m_current_param++;
328    }
329 }
330 
current_param() const331 unsigned VertexStageWithOutputInfo::current_param() const
332 {
333    return m_current_param;
334 }
335 
finalize_exports()336 void VertexStageExportForFS::finalize_exports()
337 {
338    if (m_key.vs.as_gs_a) {
339       PValue o(new GPRValue(0,PIPE_SWIZZLE_0));
340       GPRVector primid({m_proc.primitive_id(), o,o,o});
341       m_last_param_export = new ExportInstruction(current_param(), primid, ExportInstruction::et_param);
342       m_proc.emit_export_instruction(m_last_param_export);
343       int i;
344       i = m_proc.sh_info().noutput++;
345       auto& io = m_proc.sh_info().output[i];
346       io.name = TGSI_SEMANTIC_PRIMID;
347       io.sid = 0;
348       io.gpr = 0;
349       io.interpolate = TGSI_INTERPOLATE_CONSTANT;
350       io.write_mask = 0x1;
351       io.spi_sid = m_key.vs.prim_id_out;
352       m_proc.sh_info().vs_as_gs_a = 1;
353    }
354 
355    if (m_so_info && m_so_info->num_outputs)
356       emit_stream(-1);
357 
358    m_pipe_shader->enabled_stream_buffers_mask = m_enabled_stream_buffers_mask;
359 
360    if (!m_last_param_export) {
361       GPRVector value(0,{7,7,7,7});
362       m_last_param_export = new ExportInstruction(0, value, ExportInstruction::et_param);
363       m_proc.emit_export_instruction(m_last_param_export);
364    }
365    m_last_param_export->set_last();
366 
367    if (!m_last_pos_export) {
368       GPRVector value(0,{7,7,7,7});
369       m_last_pos_export = new ExportInstruction(0, value, ExportInstruction::et_pos);
370       m_proc.emit_export_instruction(m_last_pos_export);
371    }
372    m_last_pos_export->set_last();
373 }
374 
emit_stream(int stream)375 bool VertexStageExportForFS::emit_stream(int stream)
376 {
377    assert(m_so_info);
378    if (m_so_info->num_outputs > PIPE_MAX_SO_OUTPUTS) {
379            R600_ERR("Too many stream outputs: %d\n", m_so_info->num_outputs);
380            return false;
381    }
382    for (unsigned i = 0; i < m_so_info->num_outputs; i++) {
383            if (m_so_info->output[i].output_buffer >= 4) {
384                    R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
385                             m_so_info->output[i].output_buffer);
386                    return false;
387            }
388    }
389    const GPRVector *so_gpr[PIPE_MAX_SHADER_OUTPUTS];
390    unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
391    std::vector<GPRVector> tmp(m_so_info->num_outputs);
392 
393    /* Initialize locations where the outputs are stored. */
394    for (unsigned i = 0; i < m_so_info->num_outputs; i++) {
395       if (stream != -1 && stream != m_so_info->output[i].stream)
396          continue;
397 
398       sfn_log << SfnLog::instr << "Emit stream " << i
399               << " with register index " << m_so_info->output[i].register_index << "  so_gpr:";
400 
401 
402       so_gpr[i] = m_proc.output_register(m_so_info->output[i].register_index);
403 
404       if (!so_gpr[i]) {
405          sfn_log << SfnLog::err << "\nERR: register index "
406                  << m_so_info->output[i].register_index
407                  << " doesn't correspond to an output register\n";
408          return false;
409       }
410       start_comp[i] = m_so_info->output[i].start_component;
411       /* Lower outputs with dst_offset < start_component.
412        *
413        * We can only output 4D vectors with a write mask, e.g. we can
414        * only output the W component at offset 3, etc. If we want
415        * to store Y, Z, or W at buffer offset 0, we need to use MOV
416        * to move it to X and output X. */
417       if (m_so_info->output[i].dst_offset < m_so_info->output[i].start_component) {
418 
419          GPRVector::Swizzle swizzle =  {0,1,2,3};
420          for (auto j = m_so_info->output[i].num_components; j < 4; ++j)
421             swizzle[j] = 7;
422          tmp[i] = m_proc.get_temp_vec4(swizzle);
423 
424          int sc = m_so_info->output[i].start_component;
425          AluInstruction *alu = nullptr;
426          for (int j = 0; j < m_so_info->output[i].num_components; j++) {
427             alu = new AluInstruction(op1_mov, tmp[i][j], so_gpr[i]->reg_i(j + sc), {alu_write});
428             m_proc.emit_instruction(alu);
429          }
430          if (alu)
431             alu->set_flag(alu_last_instr);
432 
433          start_comp[i] = 0;
434          so_gpr[i] = &tmp[i];
435       }
436       sfn_log << SfnLog::instr <<  *so_gpr[i] << "\n";
437    }
438 
439    /* Write outputs to buffers. */
440    for (unsigned i = 0; i < m_so_info->num_outputs; i++) {
441       sfn_log << SfnLog::instr << "Write output buffer " << i
442               << " with register index " << m_so_info->output[i].register_index << "\n";
443 
444       StreamOutIntruction *out_stream =
445             new StreamOutIntruction(*so_gpr[i],
446                                     m_so_info->output[i].num_components,
447                                     m_so_info->output[i].dst_offset - start_comp[i],
448                                     ((1 << m_so_info->output[i].num_components) - 1) << start_comp[i],
449                                     m_so_info->output[i].output_buffer,
450                                     m_so_info->output[i].stream);
451       m_proc.emit_export_instruction(out_stream);
452       m_enabled_stream_buffers_mask |= (1 << m_so_info->output[i].output_buffer) << m_so_info->output[i].stream * 4;
453    }
454    return true;
455 }
456 
457 
VertexStageExportForGS(VertexStage & proc,const r600_shader * gs_shader)458 VertexStageExportForGS::VertexStageExportForGS(VertexStage &proc,
459                                                const r600_shader *gs_shader):
460    VertexStageWithOutputInfo(proc),
461    m_num_clip_dist(0),
462    m_gs_shader(gs_shader)
463 {
464 
465 }
466 
do_store_output(const store_loc & store_info,nir_intrinsic_instr * instr)467 bool VertexStageExportForGS::do_store_output(const store_loc& store_info, nir_intrinsic_instr* instr)
468 {
469    int ring_offset = -1;
470    const r600_shader_io& out_io = m_proc.sh_info().output[store_info.driver_location];
471 
472    sfn_log << SfnLog::io << "check output " << store_info.driver_location
473            << " name=" << out_io.name<< " sid=" << out_io.sid << "\n";
474    for (unsigned k = 0; k < m_gs_shader->ninput; ++k) {
475       auto& in_io = m_gs_shader->input[k];
476       sfn_log << SfnLog::io << "  against  " <<  k << " name=" << in_io.name<< " sid=" << in_io.sid << "\n";
477 
478       if (in_io.name == out_io.name &&
479           in_io.sid == out_io.sid) {
480          ring_offset = in_io.ring_offset;
481          break;
482       }
483    }
484 
485    if (store_info.location == VARYING_SLOT_VIEWPORT) {
486       m_proc.sh_info().vs_out_viewport = 1;
487       m_proc.sh_info().vs_out_misc_write = 1;
488       return true;
489    }
490 
491    if (ring_offset == -1) {
492       sfn_log << SfnLog::err << "VS defines output at "
493               << store_info.driver_location << "name=" << out_io.name
494               << " sid=" << out_io.sid << " that is not consumed as GS input\n";
495       return true;
496    }
497 
498    uint32_t write_mask =  (1 << instr->num_components) - 1;
499 
500    GPRVector value = m_proc.vec_from_nir_with_fetch_constant(instr->src[store_info.data_loc], write_mask,
501          swizzle_from_comps(instr->num_components), true);
502 
503    auto ir = new MemRingOutIntruction(cf_mem_ring, mem_write, value,
504                                       ring_offset >> 2, 4, PValue());
505    m_proc.emit_export_instruction(ir);
506 
507    m_proc.sh_info().output[store_info.driver_location].write_mask |= write_mask;
508    if (store_info.location == VARYING_SLOT_CLIP_DIST0 ||
509        store_info.location == VARYING_SLOT_CLIP_DIST1)
510       m_num_clip_dist += 4;
511 
512    return true;
513 }
514 
finalize_exports()515 void VertexStageExportForGS::finalize_exports()
516 {
517 
518 }
519 
VertexStageExportForES(VertexStage & proc)520 VertexStageExportForES::VertexStageExportForES(VertexStage& proc):
521    VertexStageExportBase(proc)
522 {
523 }
524 
do_store_output(const store_loc & store_info,nir_intrinsic_instr * instr)525 bool VertexStageExportForES::do_store_output(const store_loc& store_info, nir_intrinsic_instr* instr)
526 {
527    return true;
528 }
529 
finalize_exports()530 void VertexStageExportForES::finalize_exports()
531 {
532 
533 }
534 
535 }
536