/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_nir.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"
#include "util/bitscan.h"

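/* Emit a copy of the given memory load/store intrinsic at "offset" bytes
 * past the original offset source, with the requested number of components,
 * bit size, and alignment.  For stores, store_src provides the re-packed
 * value to write; for loads, the new destination SSA def is returned.
 */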
static nir_ssa_def *
dup_mem_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
                  nir_ssa_def *store_src, int offset,
                  unsigned num_components, unsigned bit_size,
                  unsigned align)
{
   const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];

   nir_intrinsic_instr *dup =
      nir_intrinsic_instr_create(b->shader, intrin->intrinsic);

   nir_src *intrin_offset_src = nir_get_io_offset_src(intrin);
   for (unsigned i = 0; i < info->num_srcs; i++) {
      assert(intrin->src[i].is_ssa);
      if (i == 0 && store_src) {
         assert(!info->has_dest);
         assert(&intrin->src[i] != intrin_offset_src);
         dup->src[i] = nir_src_for_ssa(store_src);
      } else if (&intrin->src[i] == intrin_offset_src) {
         dup->src[i] = nir_src_for_ssa(nir_iadd_imm(b, intrin->src[i].ssa,
                                                    offset));
      } else {
         dup->src[i] = nir_src_for_ssa(intrin->src[i].ssa);
      }
   }

   dup->num_components = num_components;
   if (intrin->intrinsic == nir_intrinsic_load_scratch ||
       intrin->intrinsic == nir_intrinsic_store_scratch)
      assert(num_components == 1);

   for (unsigned i = 0; i < info->num_indices; i++)
      dup->const_index[i] = intrin->const_index[i];

   if (nir_intrinsic_has_access(intrin))
      nir_intrinsic_set_access(dup, nir_intrinsic_access(intrin));

   nir_intrinsic_set_align(dup, align, 0);

   if (info->has_dest) {
      assert(intrin->dest.is_ssa);
      nir_ssa_dest_init(&dup->instr, &dup->dest,
                        num_components, bit_size, NULL);
   } else {
      nir_intrinsic_set_write_mask(dup, (1 << num_components) - 1);
   }

   nir_builder_instr_insert(b, &dup->instr);

   return info->has_dest ? &dup->dest.ssa : NULL;
}

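/* Lower a single load whose bit size, component count, or alignment the
 * hardware messages cannot handle directly.  The load is split into
 * supported pieces and the result is reassembled with nir_extract_bits.
 * Returns true if the load was replaced.
 */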
static bool
lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
                        const struct intel_device_info *devinfo)
{
   const bool needs_scalar =
      intrin->intrinsic == nir_intrinsic_load_scratch;

   assert(intrin->dest.is_ssa);
   const unsigned bit_size = intrin->dest.ssa.bit_size;
   const unsigned num_components = intrin->dest.ssa.num_components;
   const unsigned bytes_read = num_components * (bit_size / 8);
   const unsigned align = nir_intrinsic_align(intrin);

   if (bit_size == 32 && align >= 32 && intrin->num_components <= 4 &&
       (!needs_scalar || intrin->num_components == 1))
      return false;

   nir_ssa_def *result;
   nir_src *offset_src = nir_get_io_offset_src(intrin);
   if (bit_size < 32 && !needs_scalar && nir_src_is_const(*offset_src)) {
      /* The offset is constant so we can use a 32-bit load and just shift it
       * around as needed.
       */
      const int load_offset = nir_src_as_uint(*offset_src) % 4;
      assert(load_offset % (bit_size / 8) == 0);
      const unsigned load_comps32 = DIV_ROUND_UP(bytes_read + load_offset, 4);
      /* A 16-bit vec4 is a 32-bit vec2.  We add an extra component in case
       * we offset into a component with load_offset.
       */
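      /* Illustrative arithmetic: an 8-byte (16-bit vec4) read at
       * load_offset == 2 needs DIV_ROUND_UP(8 + 2, 4) == 3 dwords, which is
       * the worst case allowed by the assert below.
       */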
      assert(load_comps32 <= 3);

      nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, -load_offset,
                                            load_comps32, 32, 4);
      result = nir_extract_bits(b, &load, 1, load_offset * 8,
                                num_components, bit_size);
   } else {
      /* Otherwise, we have to break it into smaller loads.  We could end up
       * with as many as 32 loads if we're loading a u64vec16 from scratch.
       */
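      /* (A u64vec16 is 16 * 8 = 128 bytes, and scratch loads below are
       * emitted as scalar dwords, i.e. 4 bytes at a time, hence 128 / 4 = 32
       * loads in the worst case.)
       */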
      nir_ssa_def *loads[32];
      unsigned num_loads = 0;
      int load_offset = 0;
      while (load_offset < bytes_read) {
         const unsigned bytes_left = bytes_read - load_offset;
         unsigned load_bit_size, load_comps;
         if (align < 4) {
            load_comps = 1;
            /* Choose a byte, word, or dword */
            load_bit_size = util_next_power_of_two(MIN2(bytes_left, 4)) * 8;
         } else {
            assert(load_offset % 4 == 0);
            load_bit_size = 32;
            load_comps = needs_scalar ? 1 :
                         DIV_ROUND_UP(MIN2(bytes_left, 16), 4);
         }

         loads[num_loads++] = dup_mem_intrinsic(b, intrin, NULL, load_offset,
                                                load_comps, load_bit_size,
                                                align);

         load_offset += load_comps * (load_bit_size / 8);
      }
      assert(num_loads <= ARRAY_SIZE(loads));
      result = nir_extract_bits(b, loads, num_loads, 0,
                                num_components, bit_size);
   }

   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, result);
   nir_instr_remove(&intrin->instr);

   return true;
}

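/* Lower a single store that the hardware messages cannot handle directly
 * (unsupported bit size, partial write mask, or insufficient alignment)
 * into a series of supported stores.  Returns true if the store was
 * replaced.
 */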
static bool
lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin,
                         const struct intel_device_info *devinfo)
{
   const bool needs_scalar =
      intrin->intrinsic == nir_intrinsic_store_scratch;

   assert(intrin->src[0].is_ssa);
   nir_ssa_def *value = intrin->src[0].ssa;

   assert(intrin->num_components == value->num_components);
   const unsigned bit_size = value->bit_size;
   const unsigned num_components = intrin->num_components;
   const unsigned bytes_written = num_components * (bit_size / 8);
   const unsigned align_mul = nir_intrinsic_align_mul(intrin);
   const unsigned align_offset = nir_intrinsic_align_offset(intrin);
   const unsigned align = nir_intrinsic_align(intrin);

   nir_component_mask_t writemask = nir_intrinsic_write_mask(intrin);
   assert(writemask < (1 << num_components));

   if ((value->bit_size <= 32 && num_components == 1) ||
       (value->bit_size == 32 && num_components <= 4 && align >= 32 &&
        writemask == (1 << num_components) - 1 &&
        !needs_scalar))
      return false;

   nir_src *offset_src = nir_get_io_offset_src(intrin);
   const bool offset_is_const = nir_src_is_const(*offset_src);
   const unsigned const_offset =
      offset_is_const ? nir_src_as_uint(*offset_src) : 0;

   const unsigned byte_size = bit_size / 8;
   assert(byte_size <= sizeof(uint64_t));

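   /* Build a mask with one bit per byte of the value; a byte's bit is set
    * exactly when its component is covered by the NIR write mask.
    */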
   BITSET_DECLARE(mask, NIR_MAX_VEC_COMPONENTS * sizeof(uint64_t));
   BITSET_ZERO(mask);

   for (unsigned i = 0; i < num_components; i++) {
      if (writemask & (1u << i))
         BITSET_SET_RANGE_INSIDE_WORD(mask, i * byte_size, ((i + 1) * byte_size) - 1);
   }
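
   /* Emit one store per contiguous run of set bytes.  As an illustrative
    * example, a 16-bit vec4 store with write mask 0x5 has bytes {0,1} and
    * {4,5} set and is emitted as two 16-bit scalar stores.
    */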
   while (BITSET_FFS(mask) != 0) {
      const int start = BITSET_FFS(mask) - 1;

      int end;
      for (end = start + 1; end < bytes_written; end++) {
         if (!BITSET_TEST(mask, end))
            break;
      }
      /* The size of the current contiguous chunk in bytes */
      const unsigned chunk_bytes = end - start;

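      /* The chunk is dword-aligned if either the alignment information or a
       * constant offset proves that its starting byte falls on a multiple of
       * four.
       */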
      const bool is_dword_aligned =
         (align_mul >= 4 && (align_offset + start) % 4 == 0) ||
         (offset_is_const && (start + const_offset) % 4 == 0);

      unsigned store_comps, store_bit_size, store_align;
      if (chunk_bytes >= 4 && is_dword_aligned) {
         store_align = MAX2(align, 4);
         store_bit_size = 32;
         store_comps = needs_scalar ? 1 : MIN2(chunk_bytes, 16) / 4;
      } else {
         store_align = align;
         store_comps = 1;
         store_bit_size = MIN2(chunk_bytes, 4) * 8;
         /* The bit size must be a power of two */
         if (store_bit_size == 24)
            store_bit_size = 16;
      }
      const unsigned store_bytes = store_comps * (store_bit_size / 8);

      nir_ssa_def *packed = nir_extract_bits(b, &value, 1, start * 8,
                                             store_comps, store_bit_size);

      dup_mem_intrinsic(b, intrin, packed, start,
                        store_comps, store_bit_size, store_align);

      BITSET_CLEAR_RANGE(mask, start, (start + store_bytes - 1));
   }

   nir_instr_remove(&intrin->instr);

   return true;
}

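/* Callback for nir_shader_instructions_pass: dispatch each memory load or
 * store intrinsic to the matching lowering helper above.
 */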
static bool
lower_mem_access_bit_sizes_instr(nir_builder *b,
                                 nir_instr *instr,
                                 void *cb_data)
{
   const struct intel_device_info *devinfo = cb_data;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   b->cursor = nir_after_instr(instr);

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_global:
   case nir_intrinsic_load_global_constant:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_shared:
   case nir_intrinsic_load_scratch:
      return lower_mem_load_bit_size(b, intrin, devinfo);

   case nir_intrinsic_store_global:
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_store_shared:
   case nir_intrinsic_store_scratch:
      return lower_mem_store_bit_size(b, intrin, devinfo);

   default:
      return false;
   }
}

/**
 * This pass lowers arbitrary SSBO and shared memory load/store operations to
 * intrinsics which the GEN hardware can handle natively.  In particular,
 * we have two general types of memory load/store messages:
 *
 *  - Untyped surface read/write:  These can load/store between one and four
 *    dword components to/from a dword-aligned offset.
 *
 *  - Byte scattered read/write:  These can load/store a single byte, word, or
 *    dword scalar to/from an unaligned byte offset.
 *
 * Neither type of message can do a write-masked store.  This pass converts
 * all NIR load/store intrinsics into a series of 8-, 16-, or 32-bit
 * load/store intrinsics with a number of components that we can directly
 * handle in hardware and with a trivial write-mask.
 *
 * For scratch access, additional care has to be taken because of the way we
 * swizzle the memory addresses to achieve decent cache locality.  In
 * particular, even though untyped surface read/write messages exist and work,
 * we can't use them to load multiple components in a single SEND.  For more
 * detail on the scratch swizzle, see fs_visitor::swizzle_nir_scratch_addr.
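 *
 * For example (purely illustrative), a single 64-bit SSBO load is lowered
 * here into a 32-bit vec2 load whose halves are then reassembled with
 * nir_extract_bits.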
 */
bool
brw_nir_lower_mem_access_bit_sizes(nir_shader *shader,
                                   const struct intel_device_info *devinfo)
{
   return nir_shader_instructions_pass(shader, lower_mem_access_bit_sizes_instr,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance,
                                       (void *)devinfo);
}