/* -*- mesa-c++  -*-
 *
 * Copyright (c) 2020 Collabora LTD
 *
 * Author: Gert Wollny <gert.wollny@collabora.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "sfn_nir.h"

#include "nir.h"
#include "nir_builder.h"

#include <map>
#include <vector>

namespace r600 {

using std::map;
using std::pair;
using std::make_pair;
using std::vector;

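/* LowerSplit64BitVar splits 64-bit loads, stores, and a few vector ALU
 * reductions that use three or four 64-bit components into operations that
 * access at most two 64-bit components each, which is what the rest of the
 * r600 backend can handle. Wide variables are replaced by a dvec2 part and
 * a one- or two-component remainder, and the results are re-assembled into
 * the original width with vec3/vec4 instructions. */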
class LowerSplit64BitVar : public NirLowerInstruction {
public:
   ~LowerSplit64BitVar();
   using VarSplit = pair<nir_variable*, nir_variable*>;
   using VarMap = map<unsigned, VarSplit>;

   nir_ssa_def *
   split_double_load_deref(nir_intrinsic_instr *intr);

   nir_ssa_def *
   split_double_store_deref(nir_intrinsic_instr *intr);

private:
   nir_ssa_def *
   split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index);

   nir_ssa_def *
   split_load_deref_var(nir_intrinsic_instr *intr);

   nir_ssa_def *
   split_store_deref_array(nir_intrinsic_instr *intr, nir_deref_instr *deref);

   nir_ssa_def *
   split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref1);

   VarSplit get_var_pair(nir_variable *old_var);

   nir_ssa_def *
   merge_64bit_loads(nir_ssa_def *load1, nir_ssa_def *load2, bool out_is_vec3);

   nir_ssa_def *split_double_load(nir_intrinsic_instr *load1);

   nir_ssa_def *
   split_store_output(nir_intrinsic_instr *store1);

   nir_ssa_def *split_double_load_uniform(nir_intrinsic_instr *intr);

   nir_ssa_def *
   split_double_load_ssbo(nir_intrinsic_instr *intr);

   nir_ssa_def *
   split_double_load_ubo(nir_intrinsic_instr *intr);

   nir_ssa_def *
   split_reduction(nir_ssa_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction);

   nir_ssa_def *
   split_reduction3(nir_alu_instr *alu,
                    nir_op op1, nir_op op2, nir_op reduction);

   nir_ssa_def *
   split_reduction4(nir_alu_instr *alu,
                    nir_op op1, nir_op op2, nir_op reduction);

   nir_ssa_def *split_bcsel(nir_alu_instr *alu);

   nir_ssa_def *split_load_const(nir_load_const_instr *lc);

   bool filter(const nir_instr *instr) const override;
   nir_ssa_def *lower(nir_instr *instr) override;

   VarMap m_varmap;
   vector<nir_variable*> m_old_vars;
   vector<nir_instr *> m_old_stores;
};

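/* Select only the instructions this pass has to split: 64-bit loads and
 * stores of three or more components, wide 64-bit bcsel, the vec3/vec4
 * comparison reductions and dot products, and wide 64-bit constants. */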
bool
LowerSplit64BitVar::filter(const nir_instr *instr) const
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
         if (nir_dest_bit_size(intr->dest) != 64)
            return false;
         return nir_dest_num_components(intr->dest) >= 3;
      case nir_intrinsic_store_output:
         if (nir_src_bit_size(intr->src[0]) != 64)
            return false;
         return nir_src_num_components(intr->src[0]) >= 3;
      case nir_intrinsic_store_deref:
         if (nir_src_bit_size(intr->src[1]) != 64)
            return false;
         return nir_src_num_components(intr->src[1]) >= 3;
      default:
         return false;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      switch (alu->op) {
      case nir_op_bcsel:
         if (nir_dest_num_components(alu->dest.dest) < 3)
            return false;
         return nir_dest_bit_size(alu->dest.dest) == 64;
      case nir_op_bany_fnequal3:
      case nir_op_bany_fnequal4:
      case nir_op_ball_fequal3:
      case nir_op_ball_fequal4:
      case nir_op_bany_inequal3:
      case nir_op_bany_inequal4:
      case nir_op_ball_iequal3:
      case nir_op_ball_iequal4:
      case nir_op_fdot3:
      case nir_op_fdot4:
         return nir_src_bit_size(alu->src[1].src) == 64;
      default:
         return false;
      }
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      if (lc->def.bit_size != 64)
         return false;
      return lc->def.num_components >= 3;
   }
   default:
      return false;
   }
}

nir_ssa_def *
LowerSplit64BitVar::merge_64bit_loads(nir_ssa_def *load1,
                                      nir_ssa_def *load2, bool out_is_vec3)
{
   if (out_is_vec3)
      return nir_vec3(b, nir_channel(b, load1, 0),
                      nir_channel(b, load1, 1),
                      nir_channel(b, load2, 0));
   else
      return nir_vec4(b, nir_channel(b, load1, 0),
                      nir_channel(b, load1, 1),
                      nir_channel(b, load2, 0),
                      nir_channel(b, load2, 1));
}

LowerSplit64BitVar::~LowerSplit64BitVar()
{
   for (auto&& v : m_old_vars)
      exec_node_remove(&v->node);

   for (auto&& v : m_old_stores)
      nir_instr_remove(v);
}

nir_ssa_def *
LowerSplit64BitVar::split_double_store_deref(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   if (deref->deref_type == nir_deref_type_var)
      return split_store_deref_var(intr, deref);
   else if (deref->deref_type == nir_deref_type_array)
      return split_store_deref_array(intr, deref);
   else {
      unreachable("only splitting of stores to vars and arrays is supported");
   }
}

nir_ssa_def *
LowerSplit64BitVar::split_double_load_deref(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   if (deref->deref_type == nir_deref_type_var)
      return split_load_deref_var(intr);
   else if (deref->deref_type == nir_deref_type_array)
      return split_load_deref_array(intr, deref->arr.index);
   else {
      unreachable("only splitting of loads from vars and arrays is supported");
   }
}

nir_ssa_def *
LowerSplit64BitVar::split_load_deref_array(nir_intrinsic_instr *intr, nir_src& index)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = old_var->type->without_array()->components();

   assert(old_components > 2 && old_components <= 4);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   auto deref_array1 = nir_build_deref_array(b, deref1, nir_ssa_for_src(b, index, 1));
   auto load1 = nir_build_load_deref(b, 2, 64, &deref_array1->dest.ssa, (enum gl_access_qualifier)0);

   auto deref2 = nir_build_deref_var(b, vars.second);
   auto deref_array2 = nir_build_deref_array(b, deref2, nir_ssa_for_src(b, index, 1));

   auto load2 = nir_build_load_deref(b, old_components - 2, 64, &deref_array2->dest.ssa, (enum gl_access_qualifier)0);

   return merge_64bit_loads(load1, load2, old_components == 3);
}

nir_ssa_def *
LowerSplit64BitVar::split_store_deref_array(nir_intrinsic_instr *intr, nir_deref_instr *deref)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = old_var->type->without_array()->components();

   assert(old_components > 2 && old_components <= 4);

   auto src_xy = nir_channels(b, intr->src[1].ssa, 3);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   auto deref_array1 = nir_build_deref_array(b, deref1, nir_ssa_for_src(b, deref->arr.index, 1));

   nir_build_store_deref(b, &deref_array1->dest.ssa, src_xy, 3);

   auto deref2 = nir_build_deref_var(b, vars.second);
   auto deref_array2 = nir_build_deref_array(b, deref2, nir_ssa_for_src(b, deref->arr.index, 1));

   if (old_components == 3)
      nir_build_store_deref(b, &deref_array2->dest.ssa, nir_channel(b, intr->src[1].ssa, 2), 1);
   else
      nir_build_store_deref(b, &deref_array2->dest.ssa, nir_channels(b, intr->src[1].ssa, 0xc), 3);

   return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}

nir_ssa_def *
LowerSplit64BitVar::split_store_deref_var(nir_intrinsic_instr *intr, nir_deref_instr *deref)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   unsigned old_components = old_var->type->without_array()->components();

   assert(old_components > 2 && old_components <= 4);

   auto src_xy = nir_channels(b, intr->src[1].ssa, 3);

   auto vars = get_var_pair(old_var);

   auto deref1 = nir_build_deref_var(b, vars.first);
   nir_build_store_deref(b, &deref1->dest.ssa, src_xy, 3);

   auto deref2 = nir_build_deref_var(b, vars.second);
   if (old_components == 3)
      nir_build_store_deref(b, &deref2->dest.ssa, nir_channel(b, intr->src[1].ssa, 2), 1);
   else
      nir_build_store_deref(b, &deref2->dest.ssa, nir_channels(b, intr->src[1].ssa, 0xc), 3);

   return NIR_LOWER_INSTR_PROGRESS_REPLACE;
}

nir_ssa_def *
LowerSplit64BitVar::split_load_deref_var(nir_intrinsic_instr *intr)
{
   auto old_var = nir_intrinsic_get_var(intr, 0);
   auto vars = get_var_pair(old_var);
   unsigned old_components = old_var->type->components();

   nir_deref_instr *deref1 = nir_build_deref_var(b, vars.first);
   auto *load1 = nir_load_deref(b, deref1);

   nir_deref_instr *deref2 = nir_build_deref_var(b, vars.second);
   deref2->type = vars.second->type;

   auto *load2 = nir_load_deref(b, deref2);

   return merge_64bit_loads(load1, load2, old_components == 3);
}

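/* Look up (or create on first use) the pair of variables that replaces a
 * wide 64-bit variable: a dvec2 for the .xy components and a dvec1/dvec2
 * for the remainder. For shader inputs and outputs the second variable
 * occupies the next location and driver_location slot. */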
LowerSplit64BitVar::VarSplit
LowerSplit64BitVar::get_var_pair(nir_variable *old_var)
{
   auto split_vars = m_varmap.find(old_var->data.driver_location);

   assert(old_var->type->without_array()->components() > 2);

   if (split_vars == m_varmap.end()) {
      auto var1 = nir_variable_clone(old_var, b->shader);
      auto var2 = nir_variable_clone(old_var, b->shader);

      var1->type = glsl_dvec_type(2);
      var2->type = glsl_dvec_type(old_var->type->without_array()->components() - 2);

      if (old_var->type->is_array()) {
         var1->type = glsl_array_type(var1->type, old_var->type->array_size(), 0);
         var2->type = glsl_array_type(var2->type, old_var->type->array_size(), 0);
      }

      if (old_var->data.mode == nir_var_shader_in ||
          old_var->data.mode == nir_var_shader_out) {
         ++var2->data.driver_location;
         ++var2->data.location;
         nir_shader_add_variable(b->shader, var1);
         nir_shader_add_variable(b->shader, var2);
      } else if (old_var->data.mode == nir_var_function_temp) {
         exec_list_push_tail(&b->impl->locals, &var1->node);
         exec_list_push_tail(&b->impl->locals, &var2->node);
      }

      m_varmap[old_var->data.driver_location] = make_pair(var1, var2);
   }
   return m_varmap[old_var->data.driver_location];
}

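/* Split a wide 64-bit IO load into two loads of at most two components
 * that address consecutive IO slots. Schematically (for illustration):
 *
 *    vec4 64 ssa_1 = load_input(base = n)
 * becomes
 *    vec2 64 ssa_1 = load_input(base = n,     num_slots = 1)
 *    vec2 64 ssa_2 = load_input(base = n + 1, num_slots = 1)
 *    vec4 64 ssa_3 = vec4(ssa_1.x, ssa_1.y, ssa_2.x, ssa_2.y)
 */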
nir_ssa_def *
LowerSplit64BitVar::split_double_load(nir_intrinsic_instr *load1)
{
   unsigned old_components = nir_dest_num_components(load1->dest);
   auto load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &load1->instr));
   nir_io_semantics sem = nir_intrinsic_io_semantics(load1);

   load1->dest.ssa.num_components = 2;
   sem.num_slots = 1;
   nir_intrinsic_set_io_semantics(load1, sem);

   load2->dest.ssa.num_components = old_components - 2;
   sem.location += 1;
   nir_intrinsic_set_io_semantics(load2, sem);
   nir_intrinsic_set_base(load2, nir_intrinsic_base(load1) + 1);
   nir_builder_instr_insert(b, &load2->instr);

   return merge_64bit_loads(&load1->dest.ssa, &load2->dest.ssa, old_components == 3);
}

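/* Split a wide 64-bit store_output the same way: the original store keeps
 * the .xy channels, and a cloned store writes the remaining channels to
 * the following IO slot. */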
nir_ssa_def *
LowerSplit64BitVar::split_store_output(nir_intrinsic_instr *store1)
{
   auto src = store1->src[0];
   unsigned old_components = nir_src_num_components(src);
   nir_io_semantics sem = nir_intrinsic_io_semantics(store1);

   auto store2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &store1->instr));
   auto src1 = nir_channels(b, src.ssa, 3);
   auto src2 = nir_channels(b, src.ssa, old_components == 3 ? 4 : 0xc);

   /* Rewrite the sources in place; rewriting the local copy `src` would
    * leave the instructions untouched and corrupt the use lists. */
   nir_instr_rewrite_src(&store1->instr, &store1->src[0], nir_src_for_ssa(src1));
   nir_intrinsic_set_write_mask(store1, 3);

   nir_instr_rewrite_src(&store2->instr, &store2->src[0], nir_src_for_ssa(src2));
   nir_intrinsic_set_write_mask(store2, old_components == 3 ? 1 : 3);

   sem.num_slots = 1;
   nir_intrinsic_set_io_semantics(store1, sem);

   sem.location += 1;
   nir_intrinsic_set_io_semantics(store2, sem);
   nir_intrinsic_set_base(store2, nir_intrinsic_base(store1));

   nir_builder_instr_insert(b, &store2->instr);
   return NIR_LOWER_INSTR_PROGRESS;
}

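/* The second half of a wide 64-bit uniform load is read from the next
 * uniform slot, i.e. with the source offset increased by one. */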
nir_ssa_def *
LowerSplit64BitVar::split_double_load_uniform(nir_intrinsic_instr *intr)
{
   unsigned second_components = nir_dest_num_components(intr->dest) - 2;
   nir_intrinsic_instr *load2 = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
   load2->src[0] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
   nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
   nir_intrinsic_set_base(load2, nir_intrinsic_base(intr));
   nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
   load2->num_components = second_components;

   nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
   nir_builder_instr_insert(b, &load2->instr);

   intr->dest.ssa.num_components = intr->num_components = 2;

   if (second_components == 1)
      return nir_vec3(b, nir_channel(b, &intr->dest.ssa, 0),
                      nir_channel(b, &intr->dest.ssa, 1),
                      nir_channel(b, &load2->dest.ssa, 0));
   else
      return nir_vec4(b, nir_channel(b, &intr->dest.ssa, 0),
                      nir_channel(b, &intr->dest.ssa, 1),
                      nir_channel(b, &load2->dest.ssa, 0),
                      nir_channel(b, &load2->dest.ssa, 1));
}

nir_ssa_def *
LowerSplit64BitVar::split_double_load_ssbo(nir_intrinsic_instr *intr)
{
   unsigned second_components = nir_dest_num_components(intr->dest) - 2;
   nir_intrinsic_instr *load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));

   auto new_src0 = nir_src_for_ssa(nir_iadd_imm(b, intr->src[0].ssa, 1));
   nir_instr_rewrite_src(&load2->instr, &load2->src[0], new_src0);
   load2->num_components = second_components;
   nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);

   nir_intrinsic_set_dest_type(load2, nir_intrinsic_dest_type(intr));
   nir_builder_instr_insert(b, &load2->instr);

   intr->dest.ssa.num_components = intr->num_components = 2;

   return merge_64bit_loads(&intr->dest.ssa, &load2->dest.ssa, second_components == 1);
}

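/* For UBO loads the second half is located 16 bytes (one vec4 slot)
 * further into the buffer, so bump the byte offset, the range base, and
 * the alignment offset accordingly. */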
nir_ssa_def *
LowerSplit64BitVar::split_double_load_ubo(nir_intrinsic_instr *intr)
{
   unsigned second_components = nir_dest_num_components(intr->dest) - 2;
   nir_intrinsic_instr *load2 = nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
   load2->src[0] = intr->src[0];
   load2->src[1] = nir_src_for_ssa(nir_iadd_imm(b, intr->src[1].ssa, 16));
   nir_intrinsic_set_range_base(load2, nir_intrinsic_range_base(intr) + 16);
   nir_intrinsic_set_range(load2, nir_intrinsic_range(intr));
   nir_intrinsic_set_access(load2, nir_intrinsic_access(intr));
   nir_intrinsic_set_align_mul(load2, nir_intrinsic_align_mul(intr));
   nir_intrinsic_set_align_offset(load2, nir_intrinsic_align_offset(intr) + 16);

   load2->num_components = second_components;

   nir_ssa_dest_init(&load2->instr, &load2->dest, second_components, 64, nullptr);
   nir_builder_instr_insert(b, &load2->instr);

   intr->dest.ssa.num_components = intr->num_components = 2;

   return merge_64bit_loads(&intr->dest.ssa, &load2->dest.ssa, second_components == 1);
}

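/* Evaluate a vec3/vec4 reduction as two narrower operations combined by a
 * scalar reduction op, e.g. a 64-bit fdot4 becomes (for illustration):
 *
 *    fdot4(a, b) = fadd(fdot2(a.xy, b.xy), fdot2(a.zw, b.zw))
 */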
nir_ssa_def *
LowerSplit64BitVar::split_reduction(nir_ssa_def *src[2][2], nir_op op1, nir_op op2, nir_op reduction)
{
   auto cmp0 = nir_build_alu(b, op1, src[0][0], src[0][1], nullptr, nullptr);
   auto cmp1 = nir_build_alu(b, op2, src[1][0], src[1][1], nullptr, nullptr);
   return nir_build_alu(b, reduction, cmp0, cmp1, nullptr, nullptr);
}

nir_ssa_def *
LowerSplit64BitVar::split_reduction3(nir_alu_instr *alu,
                                     nir_op op1, nir_op op2, nir_op reduction)
{
   nir_ssa_def *src[2][2];

   src[0][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 2), 3);
   src[0][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 2), 3);

   src[1][0] = nir_channel(b, nir_ssa_for_src(b, alu->src[0].src, 3), 2);
   src[1][1] = nir_channel(b, nir_ssa_for_src(b, alu->src[1].src, 3), 2);

   return split_reduction(src, op1, op2, reduction);
}

nir_ssa_def *
LowerSplit64BitVar::split_reduction4(nir_alu_instr *alu,
                                     nir_op op1, nir_op op2, nir_op reduction)
{
   nir_ssa_def *src[2][2];

   src[0][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 2), 3);
   src[0][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 2), 3);

   src[1][0] = nir_channels(b, nir_ssa_for_src(b, alu->src[0].src, 4), 0xc);
   src[1][1] = nir_channels(b, nir_ssa_for_src(b, alu->src[1].src, 4), 0xc);

   return split_reduction(src, op1, op2, reduction);
}

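/* Lower a wide 64-bit bcsel to one scalar bcsel per component and
 * re-assemble the result vector. */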
nir_ssa_def *
LowerSplit64BitVar::split_bcsel(nir_alu_instr *alu)
{
   nir_ssa_def *dest[4];
   for (unsigned i = 0; i < nir_dest_num_components(alu->dest.dest); ++i) {
      dest[i] = nir_bcsel(b,
                          nir_channel(b, alu->src[0].src.ssa, i),
                          nir_channel(b, alu->src[1].src.ssa, i),
                          nir_channel(b, alu->src[2].src.ssa, i));
   }
   return nir_vec(b, dest, nir_dest_num_components(alu->dest.dest));
}

nir_ssa_def *
LowerSplit64BitVar::split_load_const(nir_load_const_instr *lc)
{
   nir_ssa_def *ir[4];
   for (unsigned i = 0; i < lc->def.num_components; ++i)
      ir[i] = nir_imm_double(b, lc->value[i].f64);

   return nir_vec(b, ir, lc->def.num_components);
}

nir_ssa_def *
LowerSplit64BitVar::lower(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);
      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
         return split_double_load_deref(intr);
      case nir_intrinsic_load_uniform:
         return split_double_load_uniform(intr);
      case nir_intrinsic_load_ubo:
         return split_double_load_ubo(intr);
      case nir_intrinsic_load_ssbo:
         return split_double_load_ssbo(intr);
      case nir_intrinsic_load_input:
         return split_double_load(intr);
      case nir_intrinsic_store_output:
         return split_store_output(intr);
      case nir_intrinsic_store_deref:
         return split_double_store_deref(intr);
      default:
         assert(0);
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      switch (alu->op) {
      case nir_op_bany_fnequal3:
         return split_reduction3(alu, nir_op_bany_fnequal2, nir_op_fneu, nir_op_ior);
      case nir_op_ball_fequal3:
         return split_reduction3(alu, nir_op_ball_fequal2, nir_op_feq, nir_op_iand);
      case nir_op_bany_inequal3:
         return split_reduction3(alu, nir_op_bany_inequal2, nir_op_ine, nir_op_ior);
      case nir_op_ball_iequal3:
         return split_reduction3(alu, nir_op_ball_iequal2, nir_op_ieq, nir_op_iand);
      case nir_op_fdot3:
         return split_reduction3(alu, nir_op_fdot2, nir_op_fmul, nir_op_fadd);
      case nir_op_bany_fnequal4:
         return split_reduction4(alu, nir_op_bany_fnequal2, nir_op_bany_fnequal2, nir_op_ior);
      case nir_op_ball_fequal4:
         return split_reduction4(alu, nir_op_ball_fequal2, nir_op_ball_fequal2, nir_op_iand);
      case nir_op_bany_inequal4:
         return split_reduction4(alu, nir_op_bany_inequal2, nir_op_bany_inequal2, nir_op_ior);
      case nir_op_ball_iequal4:
         /* "all equal" reduces the two ball_iequal2 halves with iand */
         return split_reduction4(alu, nir_op_ball_iequal2, nir_op_ball_iequal2, nir_op_iand);
      case nir_op_fdot4:
         return split_reduction4(alu, nir_op_fdot2, nir_op_fdot2, nir_op_fadd);
      case nir_op_bcsel:
         return split_bcsel(alu);
      default:
         assert(0);
      }
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      return split_load_const(lc);
   }
   default:
      assert(0);
   }
   return nullptr;
}

/* Split 64-bit instructions so that at most two 64-bit components are
 * used in one instruction. */

bool
r600_nir_split_64bit_io(nir_shader *sh)
{
   return LowerSplit64BitVar().run(sh);
}

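/* Lower64BitToVec2 retypes the remaining 64-bit instructions to twice as
 * many 32-bit components, so that a 64-bit vec2 becomes a 32-bit vec4;
 * 64-bit pack/unpack operations then collapse into moves and swizzles. */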
class Lower64BitToVec2 : public NirLowerInstruction {
private:
   bool filter(const nir_instr *instr) const override;
   nir_ssa_def *lower(nir_instr *instr) override;

   nir_ssa_def *load_deref_64_to_vec2(nir_intrinsic_instr *intr);
   nir_ssa_def *load_uniform_64_to_vec2(nir_intrinsic_instr *intr);
   nir_ssa_def *load_ssbo_64_to_vec2(nir_intrinsic_instr *intr);
   nir_ssa_def *load_64_to_vec2(nir_intrinsic_instr *intr);
   nir_ssa_def *store_64_to_vec2(nir_intrinsic_instr *intr);
};

bool
Lower64BitToVec2::filter(const nir_instr *instr) const
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);

      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ubo_vec4:
      case nir_intrinsic_load_ssbo:
         return nir_dest_bit_size(intr->dest) == 64;
      case nir_intrinsic_store_deref: {
         if (nir_src_bit_size(intr->src[1]) == 64)
            return true;
         auto var = nir_intrinsic_get_var(intr, 0);
         if (var->type->without_array()->bit_size() == 64)
            return true;
         return (var->type->without_array()->components() != intr->num_components);
      }
      default:
         return false;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      return nir_dest_bit_size(alu->dest.dest) == 64;
   }
   case nir_instr_type_phi: {
      auto phi = nir_instr_as_phi(instr);
      return nir_dest_bit_size(phi->dest) == 64;
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      return lc->def.bit_size == 64;
   }
   case nir_instr_type_ssa_undef: {
      auto undef = nir_instr_as_ssa_undef(instr);
      return undef->def.bit_size == 64;
   }
   default:
      return false;
   }
}

nir_ssa_def *
Lower64BitToVec2::lower(nir_instr *instr)
{
   switch (instr->type) {
   case nir_instr_type_intrinsic: {
      auto intr = nir_instr_as_intrinsic(instr);
      switch (intr->intrinsic) {
      case nir_intrinsic_load_deref:
         return load_deref_64_to_vec2(intr);
      case nir_intrinsic_load_uniform:
         return load_uniform_64_to_vec2(intr);
      case nir_intrinsic_load_ssbo:
         return load_ssbo_64_to_vec2(intr);
      case nir_intrinsic_load_input:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ubo_vec4:
         return load_64_to_vec2(intr);
      case nir_intrinsic_store_deref:
         return store_64_to_vec2(intr);
      default:
         return nullptr;
      }
   }
   case nir_instr_type_alu: {
      auto alu = nir_instr_as_alu(instr);
      alu->dest.dest.ssa.bit_size = 32;
      alu->dest.dest.ssa.num_components *= 2;
      alu->dest.write_mask = (1 << alu->dest.dest.ssa.num_components) - 1;
      switch (alu->op) {
      case nir_op_pack_64_2x32_split:
         alu->op = nir_op_vec2;
         break;
      case nir_op_pack_64_2x32:
         alu->op = nir_op_mov;
         break;
      case nir_op_vec2:
         return nir_vec4(b,
                         nir_channel(b, alu->src[0].src.ssa, 0),
                         nir_channel(b, alu->src[0].src.ssa, 1),
                         nir_channel(b, alu->src[1].src.ssa, 0),
                         nir_channel(b, alu->src[1].src.ssa, 1));
      default:
         return NULL;
      }
      return NIR_LOWER_INSTR_PROGRESS;
   }
   case nir_instr_type_phi: {
      auto phi = nir_instr_as_phi(instr);
      phi->dest.ssa.bit_size = 32;
      phi->dest.ssa.num_components = 2;
      return NIR_LOWER_INSTR_PROGRESS;
   }
   case nir_instr_type_load_const: {
      auto lc = nir_instr_as_load_const(instr);
      assert(lc->def.num_components < 3);
      nir_const_value val[4] = {0};
      /* Emit each 64-bit component as a low/high pair of 32-bit values;
       * indexing with 2 * i keeps the pairs of both components. */
      for (uint i = 0; i < lc->def.num_components; ++i) {
         uint64_t v = lc->value[i].u64;
         val[2 * i].u32 = v & 0xffffffff;
         val[2 * i + 1].u32 = (v >> 32) & 0xffffffff;
      }

      return nir_build_imm(b, 2 * lc->def.num_components, 32, val);
   }
   case nir_instr_type_ssa_undef: {
      auto undef = nir_instr_as_ssa_undef(instr);
      undef->def.num_components *= 2;
      undef->def.bit_size = 32;
      return NIR_LOWER_INSTR_PROGRESS;
   }
   default:
      return nullptr;
   }
}

nir_ssa_def *
Lower64BitToVec2::load_deref_64_to_vec2(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   auto var = nir_intrinsic_get_var(intr, 0);
   unsigned components = var->type->without_array()->components();
   if (var->type->without_array()->bit_size() == 64) {
      components *= 2;
      if (deref->deref_type == nir_deref_type_var) {
         var->type = glsl_vec_type(components);
      } else if (deref->deref_type == nir_deref_type_array) {
         var->type = glsl_array_type(glsl_vec_type(components),
                                     var->type->array_size(), 0);
      } else {
         nir_print_shader(b->shader, stderr);
         assert(0 && "Only lowering of var and array derefs supported\n");
      }
   }
   deref->type = var->type;
   if (deref->deref_type == nir_deref_type_array) {
      auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
      deref_array->type = var->type;
      deref->type = deref_array->type->without_array();
   }

   intr->num_components = components;
   intr->dest.ssa.bit_size = 32;
   intr->dest.ssa.num_components = components;
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_ssa_def *
Lower64BitToVec2::store_64_to_vec2(nir_intrinsic_instr *intr)
{
   auto deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr);
   auto var = nir_intrinsic_get_var(intr, 0);

   unsigned components = var->type->without_array()->components();
   unsigned wrmask = nir_intrinsic_write_mask(intr);
   if (var->type->without_array()->bit_size() == 64) {
      components *= 2;
      if (deref->deref_type == nir_deref_type_var) {
         var->type = glsl_vec_type(components);
      } else if (deref->deref_type == nir_deref_type_array) {
         var->type = glsl_array_type(glsl_vec_type(components),
                                     var->type->array_size(), 0);
      } else {
         nir_print_shader(b->shader, stderr);
         assert(0 && "Only lowering of var and array derefs supported\n");
      }
   }
   deref->type = var->type;
   if (deref->deref_type == nir_deref_type_array) {
      auto deref_array = nir_instr_as_deref(deref->parent.ssa->parent_instr);
      deref_array->type = var->type;
      deref->type = deref_array->type->without_array();
   }
   intr->num_components = components;
   nir_intrinsic_set_write_mask(intr, wrmask == 1 ? 3 : 0xf);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_ssa_def *
Lower64BitToVec2::load_uniform_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->dest.ssa.bit_size = 32;
   intr->dest.ssa.num_components *= 2;
   nir_intrinsic_set_dest_type(intr, nir_type_float32);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_ssa_def *
Lower64BitToVec2::load_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->dest.ssa.bit_size = 32;
   intr->dest.ssa.num_components *= 2;
   /* load_input carries a COMPONENT index, load_ubo/load_ubo_vec4 do not,
    * so only double the component when the intrinsic has one. */
   if (nir_intrinsic_has_component(intr))
      nir_intrinsic_set_component(intr, nir_intrinsic_component(intr) * 2);
   return NIR_LOWER_INSTR_PROGRESS;
}

nir_ssa_def *
Lower64BitToVec2::load_ssbo_64_to_vec2(nir_intrinsic_instr *intr)
{
   intr->num_components *= 2;
   intr->dest.ssa.bit_size = 32;
   intr->dest.ssa.num_components *= 2;
   return NIR_LOWER_INSTR_PROGRESS;
}

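/* Callbacks for nir_foreach_src: store_64bit_intr records in *state
 * whether an instruction has a 64-bit source and stops at the first hit;
 * double2vec2 retypes every 64-bit source in place to twice as many
 * 32-bit components. */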
static bool store_64bit_intr(nir_src *src, void *state)
{
   bool *s = (bool *)state;
   *s = nir_src_bit_size(*src) == 64;
   return !*s;
}

static bool double2vec2(nir_src *src, void *state)
{
   if (nir_src_bit_size(*src) != 64)
      return true;

   assert(src->is_ssa);
   src->ssa->bit_size = 32;
   src->ssa->num_components *= 2;
   return true;
}

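/* Rewrite 64-bit values as pairs of 32-bit values. A first sweep collects
 * all ALU instructions with 64-bit sources and widens the write masks of
 * 64-bit stores; after Lower64BitToVec2 has retyped the instructions, the
 * collected ALU instructions get their source swizzles expanded
 * (x -> xy, y -> zw), which turns the unpack ops into plain moves. */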
bool
r600_nir_64_to_vec2(nir_shader *sh)
{
   vector<nir_instr*> intr64bit;
   nir_foreach_function(function, sh) {
      if (function->impl) {
         nir_builder b;
         nir_builder_init(&b, function->impl);

         nir_foreach_block(block, function->impl) {
            nir_foreach_instr_safe(instr, block) {
               switch (instr->type) {
               case nir_instr_type_alu: {
                  bool success = false;
                  nir_foreach_src(instr, store_64bit_intr, &success);
                  if (success)
                     intr64bit.push_back(instr);
                  break;
               }
               case nir_instr_type_intrinsic: {
                  auto ir = nir_instr_as_intrinsic(instr);
                  switch (ir->intrinsic) {
                  case nir_intrinsic_store_output:
                  case nir_intrinsic_store_ssbo: {
                     bool success = false;
                     nir_foreach_src(instr, store_64bit_intr, &success);
                     if (success) {
                        auto wm = nir_intrinsic_write_mask(ir);
                        nir_intrinsic_set_write_mask(ir, (wm == 1) ? 3 : 0xf);
                        ir->num_components *= 2;
                     }
                     break;
                  }
                  default:
                     ;
                  }
                  break;
               }
               default:
                  ;
               }
            }
         }
      }
   }

   bool result = Lower64BitToVec2().run(sh);

   if (result || !intr64bit.empty()) {
      for (auto&& instr : intr64bit) {
         if (instr->type == nir_instr_type_alu) {
            auto alu = nir_instr_as_alu(instr);
            auto alu_info = nir_op_infos[alu->op];
            for (unsigned i = 0; i < alu_info.num_inputs; ++i) {
               int swizzle[NIR_MAX_VEC_COMPONENTS] = {0};
               for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS / 2; k++) {
                  if (!nir_alu_instr_channel_used(alu, i, k)) {
                     continue;
                  }

                  switch (alu->op) {
                  case nir_op_unpack_64_2x32_split_x:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_unpack_64_2x32_split_y:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2 + 1;
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_unpack_64_2x32:
                     alu->op = nir_op_mov;
                     break;
                  case nir_op_bcsel:
                     if (i == 0) {
                        swizzle[2 * k] = swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2;
                        break;
                     }
                     FALLTHROUGH;
                  default:
                     swizzle[2 * k] = alu->src[i].swizzle[k] * 2;
                     swizzle[2 * k + 1] = alu->src[i].swizzle[k] * 2 + 1;
                  }
               }
               for (unsigned k = 0; k < NIR_MAX_VEC_COMPONENTS; ++k) {
                  alu->src[i].swizzle[k] = swizzle[k];
               }
            }
         } else
            nir_foreach_src(instr, double2vec2, nullptr);
      }
      result = true;
   }

   return result;
}

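/* StoreMerger combines store_output instructions that write different
 * components of the same output slot (e.g. as left over from the vec2
 * splitting above) into a single store with a merged write mask. */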
class StoreMerger {
public:
   StoreMerger(nir_shader *shader);
   void collect_stores();
   bool combine();
   void combine_one_slot(vector<nir_intrinsic_instr*>& stores);

   using StoreCombos = map<unsigned, vector<nir_intrinsic_instr*>>;

   StoreCombos m_stores;
   nir_shader *sh;
};

StoreMerger::StoreMerger(nir_shader *shader):
   sh(shader)
{
}

void StoreMerger::collect_stores()
{
   unsigned vertex = 0;
   nir_foreach_function(function, sh) {
      if (function->impl) {
         nir_foreach_block(block, function->impl) {
            nir_foreach_instr_safe(instr, block) {
               if (instr->type != nir_instr_type_intrinsic)
                  continue;

               auto ir = nir_instr_as_intrinsic(instr);
               if (ir->intrinsic == nir_intrinsic_emit_vertex ||
                   ir->intrinsic == nir_intrinsic_emit_vertex_with_counter) {
                  ++vertex;
                  continue;
               }
               if (ir->intrinsic != nir_intrinsic_store_output)
                  continue;

               unsigned index = nir_intrinsic_base(ir) + 64 * vertex +
                                8 * 64 * nir_intrinsic_io_semantics(ir).gs_streams;
               m_stores[index].push_back(ir);
            }
         }
      }
   }
}

bool StoreMerger::combine()
{
   bool progress = false;
   for (auto&& i : m_stores) {
      if (i.second.size() < 2)
         continue;

      combine_one_slot(i.second);
      progress = true;
   }
   return progress;
}

void StoreMerger::combine_one_slot(vector<nir_intrinsic_instr*>& stores)
{
   nir_ssa_def *srcs[4] = {nullptr};

   nir_builder b;
   nir_builder_init(&b, nir_shader_get_entrypoint(sh));
   auto last_store = *stores.rbegin();

   b.cursor = nir_before_instr(&last_store->instr);

   unsigned comps = 0;
   unsigned writemask = 0;
   unsigned first_comp = 4;
   for (auto&& store : stores) {
      int cmp = nir_intrinsic_component(store);
      for (unsigned i = 0; i < nir_src_num_components(store->src[0]); ++i, ++comps) {
         unsigned out_comp = i + cmp;
         srcs[out_comp] = nir_channel(&b, store->src[0].ssa, i);
         writemask |= 1 << out_comp;
         if (first_comp > out_comp)
            first_comp = out_comp;
      }
   }

   auto new_src = nir_vec(&b, srcs, comps);

   nir_instr_rewrite_src(&last_store->instr, &last_store->src[0], nir_src_for_ssa(new_src));
   last_store->num_components = comps;
   nir_intrinsic_set_component(last_store, first_comp);
   nir_intrinsic_set_write_mask(last_store, writemask);

   for (auto i = stores.begin(); i != stores.end() - 1; ++i)
      nir_instr_remove(&(*i)->instr);
}

bool r600_merge_vec2_stores(nir_shader *shader)
{
   r600::StoreMerger merger(shader);
   merger.collect_stores();
   return merger.combine();
}

} // end namespace r600