/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * Although it's called a load/store "vectorization" pass, this also combines
 * intersecting and identical loads/stores. It currently supports derefs, UBO,
 * SSBO and push-constant loads/stores.
 *
 * This doesn't handle copy_deref intrinsics and assumes that
 * nir_lower_alu_to_scalar() has been called and that the IR is free from ALU
 * modifiers. It also assumes that derefs have explicitly laid out types.
 *
 * After vectorization, the backend may want to call nir_lower_alu_to_scalar()
 * and nir_lower_pack(). Note that this pass creates cast instructions taking
 * derefs as a source, which some parts of NIR may not handle well.
 *
 * There are a few situations where this doesn't vectorize as well as it could:
 * - It won't turn four consecutive vec3 loads into 3 vec4 loads.
 * - It doesn't do global vectorization.
 * Handling these cases probably wouldn't provide much benefit though.
 *
 * This probably doesn't handle big-endian GPUs correctly.
 */
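
/*
 * A minimal usage sketch (illustrative only; the callback policy and option
 * values are hypothetical). The driver supplies a callback that decides
 * whether a combined access with the given alignment, bit size and component
 * count is acceptable:
 *
 *    static bool
 *    mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
 *                     unsigned bit_size, unsigned num_components,
 *                     nir_intrinsic_instr *low, nir_intrinsic_instr *high,
 *                     void *cb_data)
 *    {
 *       return align_mul >= bit_size / 8u && num_components <= 4;
 *    }
 *
 *    nir_load_store_vectorize_options opts = {
 *       .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared,
 *       .callback = mem_vectorize_cb,
 *    };
 *    NIR_PASS(progress, shader, nir_opt_load_store_vectorize, &opts);
 */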

#include "nir.h"
#include "nir_deref.h"
#include "nir_builder.h"
#include "nir_worklist.h"
#include "util/u_dynarray.h"

#include <stdlib.h>

struct intrinsic_info {
   nir_variable_mode mode; /* 0 if the mode is obtained from the deref. */
   nir_intrinsic_op op;
   bool is_atomic;
   /* Indices into nir_intrinsic::src[] or -1 if not applicable. */
   int resource_src; /* resource (e.g. from vulkan_resource_index) */
   int base_src; /* offset which it loads/stores from */
   int deref_src; /* deref which it loads/stores from */
   int value_src; /* the data it is storing */
};

static const struct intrinsic_info *
get_info(nir_intrinsic_op op)
{
   switch (op) {
#define INFO(mode, op, atomic, res, base, deref, val) \
case nir_intrinsic_##op: {\
   static const struct intrinsic_info op##_info = {mode, nir_intrinsic_##op, atomic, res, base, deref, val};\
   return &op##_info;\
}
#define LOAD(mode, op, res, base, deref) INFO(mode, load_##op, false, res, base, deref, -1)
#define STORE(mode, op, res, base, deref, val) INFO(mode, store_##op, false, res, base, deref, val)
#define ATOMIC(mode, type, op, res, base, deref, val) INFO(mode, type##_atomic_##op, true, res, base, deref, val)
   LOAD(nir_var_mem_push_const, push_constant, -1, 0, -1)
   LOAD(nir_var_mem_ubo, ubo, 0, 1, -1)
   LOAD(nir_var_mem_ssbo, ssbo, 0, 1, -1)
   STORE(nir_var_mem_ssbo, ssbo, 1, 2, -1, 0)
   LOAD(0, deref, -1, -1, 0)
   STORE(0, deref, -1, -1, 0, 1)
   LOAD(nir_var_mem_shared, shared, -1, 0, -1)
   STORE(nir_var_mem_shared, shared, -1, 1, -1, 0)
   LOAD(nir_var_mem_global, global, -1, 0, -1)
   STORE(nir_var_mem_global, global, -1, 1, -1, 0)
   ATOMIC(nir_var_mem_ssbo, ssbo, add, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, imin, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, umin, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, imax, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, umax, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, and, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, or, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, xor, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, exchange, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, comp_swap, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, fadd, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, fmin, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, fmax, 0, 1, -1, 2)
   ATOMIC(nir_var_mem_ssbo, ssbo, fcomp_swap, 0, 1, -1, 2)
   ATOMIC(0, deref, add, -1, -1, 0, 1)
   ATOMIC(0, deref, imin, -1, -1, 0, 1)
   ATOMIC(0, deref, umin, -1, -1, 0, 1)
   ATOMIC(0, deref, imax, -1, -1, 0, 1)
   ATOMIC(0, deref, umax, -1, -1, 0, 1)
   ATOMIC(0, deref, and, -1, -1, 0, 1)
   ATOMIC(0, deref, or, -1, -1, 0, 1)
   ATOMIC(0, deref, xor, -1, -1, 0, 1)
   ATOMIC(0, deref, exchange, -1, -1, 0, 1)
   ATOMIC(0, deref, comp_swap, -1, -1, 0, 1)
   ATOMIC(0, deref, fadd, -1, -1, 0, 1)
   ATOMIC(0, deref, fmin, -1, -1, 0, 1)
   ATOMIC(0, deref, fmax, -1, -1, 0, 1)
   ATOMIC(0, deref, fcomp_swap, -1, -1, 0, 1)
   ATOMIC(nir_var_mem_shared, shared, add, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, imin, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, umin, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, imax, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, umax, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, and, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, or, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, xor, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, exchange, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, comp_swap, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, fadd, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, fmin, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, fmax, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_shared, shared, fcomp_swap, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, add, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, imin, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, umin, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, imax, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, umax, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, and, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, or, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, xor, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, exchange, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, comp_swap, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, fadd, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, fmin, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, fmax, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_global, global, fcomp_swap, -1, 0, -1, 1)
   default:
      break;
#undef ATOMIC
#undef STORE
#undef LOAD
#undef INFO
   }
   return NULL;
}

/*
 * Information used to compare memory operations.
 * It canonically represents an offset as:
 * `offset_defs[0]*offset_defs_mul[0] + offset_defs[1]*offset_defs_mul[1] + ...`
 * "offset_defs" is sorted in ascending order by the ssa definition's index.
 * "resource" or "var" may be NULL.
 */
struct entry_key {
   nir_ssa_def *resource;
   nir_variable *var;
   unsigned offset_def_count;
   nir_ssa_scalar *offset_defs;
   uint64_t *offset_defs_mul;
};
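
/*
 * For example (illustrative only): an access whose offset is computed as
 * "idx * 16 + 4" is keyed with offset_defs = {idx} and offset_defs_mul = {16};
 * the constant 4 is folded into the entry's absolute offset instead of
 * appearing in the key.
 */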

/* Information on a single memory operation. */
struct entry {
   struct list_head head;
   unsigned index;

   struct entry_key *key;
   union {
      uint64_t offset; /* sign-extended */
      int64_t offset_signed;
   };
   uint32_t align_mul;
   uint32_t align_offset;

   nir_instr *instr;
   nir_intrinsic_instr *intrin;
   const struct intrinsic_info *info;
   enum gl_access_qualifier access;
   bool is_store;

   nir_deref_instr *deref;
};

struct vectorize_ctx {
   nir_shader *shader;
   const nir_load_store_vectorize_options *options;
   struct list_head entries[nir_num_variable_modes];
   struct hash_table *loads[nir_num_variable_modes];
   struct hash_table *stores[nir_num_variable_modes];
};

static uint32_t hash_entry_key(const void *key_)
{
   /* this is careful to not include pointers in the hash calculation so that
    * the order of the hash table walk is deterministic */
   struct entry_key *key = (struct entry_key*)key_;

   uint32_t hash = 0;
   if (key->resource)
      hash = XXH32(&key->resource->index, sizeof(key->resource->index), hash);
   if (key->var) {
      hash = XXH32(&key->var->index, sizeof(key->var->index), hash);
      unsigned mode = key->var->data.mode;
      hash = XXH32(&mode, sizeof(mode), hash);
   }

   for (unsigned i = 0; i < key->offset_def_count; i++) {
      hash = XXH32(&key->offset_defs[i].def->index, sizeof(key->offset_defs[i].def->index), hash);
      hash = XXH32(&key->offset_defs[i].comp, sizeof(key->offset_defs[i].comp), hash);
   }

   hash = XXH32(key->offset_defs_mul, key->offset_def_count * sizeof(uint64_t), hash);

   return hash;
}

static bool entry_key_equals(const void *a_, const void *b_)
{
   struct entry_key *a = (struct entry_key*)a_;
   struct entry_key *b = (struct entry_key*)b_;

   if (a->var != b->var || a->resource != b->resource)
      return false;

   if (a->offset_def_count != b->offset_def_count)
      return false;

   for (unsigned i = 0; i < a->offset_def_count; i++) {
      if (a->offset_defs[i].def != b->offset_defs[i].def ||
          a->offset_defs[i].comp != b->offset_defs[i].comp)
         return false;
   }

   size_t offset_def_mul_size = a->offset_def_count * sizeof(uint64_t);
   if (a->offset_def_count &&
       memcmp(a->offset_defs_mul, b->offset_defs_mul, offset_def_mul_size))
      return false;

   return true;
}

static void delete_entry_dynarray(struct hash_entry *entry)
{
   struct util_dynarray *arr = (struct util_dynarray *)entry->data;
   ralloc_free(arr);
}

static int sort_entries(const void *a_, const void *b_)
{
   struct entry *a = *(struct entry*const*)a_;
   struct entry *b = *(struct entry*const*)b_;

   if (a->offset_signed > b->offset_signed)
      return 1;
   else if (a->offset_signed < b->offset_signed)
      return -1;
   else
      return 0;
}

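/* Bit size of the data the entry loads or stores; 1-bit booleans are handled
 * as 32-bit. */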
static unsigned
get_bit_size(struct entry *entry)
{
   unsigned size = entry->is_store ?
                   entry->intrin->src[entry->info->value_src].ssa->bit_size :
                   entry->intrin->dest.ssa.bit_size;
   return size == 1 ? 32u : size;
}

/* If "def" is from an alu instruction with the opcode "op" and one of its
 * sources is a constant, update "def" to be the non-constant source, fill "c"
 * with the constant and return true. */
static bool
parse_alu(nir_ssa_scalar *def, nir_op op, uint64_t *c)
{
   if (!nir_ssa_scalar_is_alu(*def) || nir_ssa_scalar_alu_op(*def) != op)
      return false;

   nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(*def, 0);
   nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(*def, 1);
   if (op != nir_op_ishl && nir_ssa_scalar_is_const(src0)) {
      *c = nir_ssa_scalar_as_uint(src0);
      *def = src1;
   } else if (nir_ssa_scalar_is_const(src1)) {
      *c = nir_ssa_scalar_as_uint(src1);
      *def = src0;
   } else {
      return false;
   }
   return true;
}

/* Parses an offset expression such as "a * 16 + 4" or "(a * 16 + 4) * 64 + 32"
 * into a non-constant base scalar, a constant multiplier and a constant
 * addend. If the expression is entirely constant, base->def is set to NULL. */
static void
parse_offset(nir_ssa_scalar *base, uint64_t *base_mul, uint64_t *offset)
{
   if (nir_ssa_scalar_is_const(*base)) {
      *offset = nir_ssa_scalar_as_uint(*base);
      base->def = NULL;
      return;
   }

   uint64_t mul = 1;
   uint64_t add = 0;
   bool progress = false;
   do {
      uint64_t mul2 = 1, add2 = 0;

      progress = parse_alu(base, nir_op_imul, &mul2);
      mul *= mul2;

      mul2 = 0;
      progress |= parse_alu(base, nir_op_ishl, &mul2);
      mul <<= mul2;

      progress |= parse_alu(base, nir_op_iadd, &add2);
      add += add2 * mul;

      if (nir_ssa_scalar_is_alu(*base) && nir_ssa_scalar_alu_op(*base) == nir_op_mov) {
         *base = nir_ssa_scalar_chase_alu_src(*base, 0);
         progress = true;
      }
   } while (progress);

   if (base->def->parent_instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(base->def->parent_instr);
      if (intrin->intrinsic == nir_intrinsic_load_vulkan_descriptor)
         base->def = NULL;
   }

   *base_mul = mul;
   *offset = add;
}

static unsigned
type_scalar_size_bytes(const struct glsl_type *type)
{
   assert(glsl_type_is_vector_or_scalar(type) ||
          glsl_type_is_matrix(type));
   return glsl_type_is_boolean(type) ? 4u : glsl_get_bit_size(type) / 8u;
}

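/* Sign-extend the low "bit_size" bits of "val" to 64 bits. */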
static uint64_t
mask_sign_extend(uint64_t val, unsigned bit_size)
{
   return (int64_t)(val << (64 - bit_size)) >> (64 - bit_size);
}

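/* Add the term "def"*"mul" to the offset_defs/offset_defs_mul arrays, either
 * by merging it into an existing term with the same scalar or by inserting it
 * in index order. Returns the number of elements added (0 or 1). */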
static unsigned
add_to_entry_key(nir_ssa_scalar *offset_defs, uint64_t *offset_defs_mul,
                 unsigned offset_def_count, nir_ssa_scalar def, uint64_t mul)
{
   mul = mask_sign_extend(mul, def.def->bit_size);

   for (unsigned i = 0; i <= offset_def_count; i++) {
      if (i == offset_def_count || def.def->index > offset_defs[i].def->index) {
         /* insert before i */
         memmove(offset_defs + i + 1, offset_defs + i,
                 (offset_def_count - i) * sizeof(nir_ssa_scalar));
         memmove(offset_defs_mul + i + 1, offset_defs_mul + i,
                 (offset_def_count - i) * sizeof(uint64_t));
         offset_defs[i] = def;
         offset_defs_mul[i] = mul;
         return 1;
      } else if (def.def == offset_defs[i].def &&
                 def.comp == offset_defs[i].comp) {
         /* merge with offset_def at i */
         offset_defs_mul[i] += mul;
         return 0;
      }
   }
   unreachable("Unreachable.");
   return 0;
}

static struct entry_key *
create_entry_key_from_deref(void *mem_ctx,
                            struct vectorize_ctx *ctx,
                            nir_deref_path *path,
                            uint64_t *offset_base)
{
   unsigned path_len = 0;
   while (path->path[path_len])
      path_len++;

   nir_ssa_scalar offset_defs_stack[32];
   uint64_t offset_defs_mul_stack[32];
   nir_ssa_scalar *offset_defs = offset_defs_stack;
   uint64_t *offset_defs_mul = offset_defs_mul_stack;
   if (path_len > 32) {
      offset_defs = malloc(path_len * sizeof(nir_ssa_scalar));
      offset_defs_mul = malloc(path_len * sizeof(uint64_t));
   }
   unsigned offset_def_count = 0;

   struct entry_key* key = ralloc(mem_ctx, struct entry_key);
   key->resource = NULL;
   key->var = NULL;
   *offset_base = 0;

   for (unsigned i = 0; i < path_len; i++) {
      nir_deref_instr *parent = i ? path->path[i - 1] : NULL;
      nir_deref_instr *deref = path->path[i];

      switch (deref->deref_type) {
      case nir_deref_type_var: {
         assert(!parent);
         key->var = deref->var;
         break;
      }
      case nir_deref_type_array:
      case nir_deref_type_ptr_as_array: {
         assert(parent);
         nir_ssa_def *index = deref->arr.index.ssa;
         uint32_t stride = nir_deref_instr_array_stride(deref);

         nir_ssa_scalar base = {.def=index, .comp=0};
         uint64_t offset = 0, base_mul = 1;
         parse_offset(&base, &base_mul, &offset);
         offset = mask_sign_extend(offset, index->bit_size);

         *offset_base += offset * stride;
         if (base.def) {
            offset_def_count += add_to_entry_key(offset_defs, offset_defs_mul,
                                                 offset_def_count,
                                                 base, base_mul * stride);
         }
         break;
      }
      case nir_deref_type_struct: {
         assert(parent);
         int offset = glsl_get_struct_field_offset(parent->type, deref->strct.index);
         *offset_base += offset;
         break;
      }
      case nir_deref_type_cast: {
         if (!parent)
            key->resource = deref->parent.ssa;
         break;
      }
      default:
         unreachable("Unhandled deref type");
      }
   }

   key->offset_def_count = offset_def_count;
   key->offset_defs = ralloc_array(mem_ctx, nir_ssa_scalar, offset_def_count);
   key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, offset_def_count);
   memcpy(key->offset_defs, offset_defs, offset_def_count * sizeof(nir_ssa_scalar));
   memcpy(key->offset_defs_mul, offset_defs_mul, offset_def_count * sizeof(uint64_t));

   if (offset_defs != offset_defs_stack)
      free(offset_defs);
   if (offset_defs_mul != offset_defs_mul_stack)
      free(offset_defs_mul);

   return key;
}

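/* Decompose "base" into up to "left" terms by recursing through iadd
 * instructions and add each term to the entry key. Returns the number of
 * offset defs added. */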
static unsigned
parse_entry_key_from_offset(struct entry_key *key, unsigned size, unsigned left,
                            nir_ssa_scalar base, uint64_t base_mul, uint64_t *offset)
{
   uint64_t new_mul;
   uint64_t new_offset;
   parse_offset(&base, &new_mul, &new_offset);
   *offset += new_offset * base_mul;

   if (!base.def)
      return 0;

   base_mul *= new_mul;

   assert(left >= 1);

   if (left >= 2) {
      if (nir_ssa_scalar_is_alu(base) && nir_ssa_scalar_alu_op(base) == nir_op_iadd) {
         nir_ssa_scalar src0 = nir_ssa_scalar_chase_alu_src(base, 0);
         nir_ssa_scalar src1 = nir_ssa_scalar_chase_alu_src(base, 1);
         unsigned amount = parse_entry_key_from_offset(key, size, left - 1, src0, base_mul, offset);
         amount += parse_entry_key_from_offset(key, size + amount, left - amount, src1, base_mul, offset);
         return amount;
      }
   }

   return add_to_entry_key(key->offset_defs, key->offset_defs_mul, size, base, base_mul);
}

static struct entry_key *
create_entry_key_from_offset(void *mem_ctx, nir_ssa_def *base, uint64_t base_mul, uint64_t *offset)
{
   struct entry_key *key = ralloc(mem_ctx, struct entry_key);
   key->resource = NULL;
   key->var = NULL;
   if (base) {
      nir_ssa_scalar offset_defs[32];
      uint64_t offset_defs_mul[32];
      key->offset_defs = offset_defs;
      key->offset_defs_mul = offset_defs_mul;

      nir_ssa_scalar scalar = {.def=base, .comp=0};
      key->offset_def_count = parse_entry_key_from_offset(key, 0, 32, scalar, base_mul, offset);

      key->offset_defs = ralloc_array(mem_ctx, nir_ssa_scalar, key->offset_def_count);
      key->offset_defs_mul = ralloc_array(mem_ctx, uint64_t, key->offset_def_count);
      memcpy(key->offset_defs, offset_defs, key->offset_def_count * sizeof(nir_ssa_scalar));
      memcpy(key->offset_defs_mul, offset_defs_mul, key->offset_def_count * sizeof(uint64_t));
   } else {
      key->offset_def_count = 0;
      key->offset_defs = NULL;
      key->offset_defs_mul = NULL;
   }
   return key;
}

static nir_variable_mode
get_variable_mode(struct entry *entry)
{
   if (entry->info->mode)
      return entry->info->mode;
   assert(entry->deref && util_bitcount(entry->deref->modes) == 1);
   return entry->deref->modes;
}

static unsigned
mode_to_index(nir_variable_mode mode)
{
   assert(util_bitcount(mode) == 1);

   /* Globals and SSBOs should be tracked together */
   if (mode == nir_var_mem_global)
      mode = nir_var_mem_ssbo;

   return ffs(mode) - 1;
}

static nir_variable_mode
aliasing_modes(nir_variable_mode modes)
{
   /* Global and SSBO can alias */
   if (modes & (nir_var_mem_ssbo | nir_var_mem_global))
      modes |= nir_var_mem_ssbo | nir_var_mem_global;
   return modes;
}

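/* Conservatively compute align_mul/align_offset for an entry: the lowest set
 * bit across the offset multipliers bounds the provable power-of-two alignment
 * of the variable part of the offset, and the constant part provides
 * align_offset. Stronger alignment information already present on the
 * intrinsic is kept instead. */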
static void
calc_alignment(struct entry *entry)
{
   uint32_t align_mul = 31;
   for (unsigned i = 0; i < entry->key->offset_def_count; i++) {
      if (entry->key->offset_defs_mul[i])
         align_mul = MIN2(align_mul, ffsll(entry->key->offset_defs_mul[i]));
   }

   entry->align_mul = 1u << (align_mul - 1);
   bool has_align = nir_intrinsic_infos[entry->intrin->intrinsic].index_map[NIR_INTRINSIC_ALIGN_MUL];
   if (!has_align || entry->align_mul >= nir_intrinsic_align_mul(entry->intrin)) {
      entry->align_offset = entry->offset % entry->align_mul;
   } else {
      entry->align_mul = nir_intrinsic_align_mul(entry->intrin);
      entry->align_offset = nir_intrinsic_align_offset(entry->intrin);
   }
}

static struct entry *
create_entry(struct vectorize_ctx *ctx,
             const struct intrinsic_info *info,
             nir_intrinsic_instr *intrin)
{
   struct entry *entry = rzalloc(ctx, struct entry);
   entry->intrin = intrin;
   entry->instr = &intrin->instr;
   entry->info = info;
   entry->is_store = entry->info->value_src >= 0;

   if (entry->info->deref_src >= 0) {
      entry->deref = nir_src_as_deref(intrin->src[entry->info->deref_src]);
      nir_deref_path path;
      nir_deref_path_init(&path, entry->deref, NULL);
      entry->key = create_entry_key_from_deref(entry, ctx, &path, &entry->offset);
      nir_deref_path_finish(&path);
   } else {
      nir_ssa_def *base = entry->info->base_src >= 0 ?
                          intrin->src[entry->info->base_src].ssa : NULL;
      uint64_t offset = 0;
      if (nir_intrinsic_has_base(intrin))
         offset += nir_intrinsic_base(intrin);
      entry->key = create_entry_key_from_offset(entry, base, 1, &offset);
      entry->offset = offset;

      if (base)
         entry->offset = mask_sign_extend(entry->offset, base->bit_size);
   }

   if (entry->info->resource_src >= 0)
      entry->key->resource = intrin->src[entry->info->resource_src].ssa;

   if (nir_intrinsic_has_access(intrin))
      entry->access = nir_intrinsic_access(intrin);
   else if (entry->key->var)
      entry->access = entry->key->var->data.access;

   if (nir_intrinsic_can_reorder(intrin))
      entry->access |= ACCESS_CAN_REORDER;

   uint32_t restrict_modes = nir_var_shader_in | nir_var_shader_out;
   restrict_modes |= nir_var_shader_temp | nir_var_function_temp;
   restrict_modes |= nir_var_uniform | nir_var_mem_push_const;
   restrict_modes |= nir_var_system_value | nir_var_mem_shared;
   if (get_variable_mode(entry) & restrict_modes)
      entry->access |= ACCESS_RESTRICT;

   calc_alignment(entry);

   return entry;
}

static nir_deref_instr *
cast_deref(nir_builder *b, unsigned num_components, unsigned bit_size, nir_deref_instr *deref)
{
   if (glsl_get_components(deref->type) == num_components &&
       type_scalar_size_bytes(deref->type)*8u == bit_size)
      return deref;

   enum glsl_base_type types[] = {
      GLSL_TYPE_UINT8, GLSL_TYPE_UINT16, GLSL_TYPE_UINT, GLSL_TYPE_UINT64};
   enum glsl_base_type base = types[ffs(bit_size / 8u) - 1u];
   const struct glsl_type *type = glsl_vector_type(base, num_components);

   if (deref->type == type)
      return deref;

   return nir_build_deref_cast(b, &deref->dest.ssa, deref->modes, type, 0);
}

/* Return true if "new_bit_size" is a usable bit size for a vectorized load/store
 * of "low" and "high". */
static bool
new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size,
                       struct entry *low, struct entry *high, unsigned size)
{
   if (size % new_bit_size != 0)
      return false;

   unsigned new_num_components = size / new_bit_size;
   if (!nir_num_components_valid(new_num_components))
      return false;

   unsigned high_offset = high->offset_signed - low->offset_signed;

   /* check nir_extract_bits limitations */
   unsigned common_bit_size = MIN2(get_bit_size(low), get_bit_size(high));
   common_bit_size = MIN2(common_bit_size, new_bit_size);
   if (high_offset > 0)
      common_bit_size = MIN2(common_bit_size, (1u << (ffs(high_offset * 8) - 1)));
   if (new_bit_size / common_bit_size > NIR_MAX_VEC_COMPONENTS)
      return false;

   if (!ctx->options->callback(low->align_mul,
                               low->align_offset,
                               new_bit_size, new_num_components,
                               low->intrin, high->intrin,
                               ctx->options->cb_data))
      return false;

   if (low->is_store) {
      unsigned low_size = low->intrin->num_components * get_bit_size(low);
      unsigned high_size = high->intrin->num_components * get_bit_size(high);

      if (low_size % new_bit_size != 0)
         return false;
      if (high_size % new_bit_size != 0)
         return false;

      unsigned write_mask = nir_intrinsic_write_mask(low->intrin);
      if (!nir_component_mask_can_reinterpret(write_mask, get_bit_size(low), new_bit_size))
         return false;

      write_mask = nir_intrinsic_write_mask(high->intrin);
      if (!nir_component_mask_can_reinterpret(write_mask, get_bit_size(high), new_bit_size))
         return false;
   }

   return true;
}

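/* Build a deref that points "offset" bytes before "deref". */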
static nir_deref_instr *subtract_deref(nir_builder *b, nir_deref_instr *deref, int64_t offset)
{
   /* avoid adding another deref to the path */
   if (deref->deref_type == nir_deref_type_ptr_as_array &&
       nir_src_is_const(deref->arr.index) &&
       offset % nir_deref_instr_array_stride(deref) == 0) {
      unsigned stride = nir_deref_instr_array_stride(deref);
      nir_ssa_def *index = nir_imm_intN_t(b, nir_src_as_int(deref->arr.index) - offset / stride,
                                          deref->dest.ssa.bit_size);
      return nir_build_deref_ptr_as_array(b, nir_deref_instr_parent(deref), index);
   }

   if (deref->deref_type == nir_deref_type_array &&
       nir_src_is_const(deref->arr.index)) {
      nir_deref_instr *parent = nir_deref_instr_parent(deref);
      unsigned stride = glsl_get_explicit_stride(parent->type);
      if (offset % stride == 0)
         return nir_build_deref_array_imm(
            b, parent, nir_src_as_int(deref->arr.index) - offset / stride);
   }

   deref = nir_build_deref_cast(b, &deref->dest.ssa, deref->modes,
                                glsl_scalar_type(GLSL_TYPE_UINT8), 1);
   return nir_build_deref_ptr_as_array(
      b, deref, nir_imm_intN_t(b, -offset, deref->dest.ssa.bit_size));
}

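/* Combine the loads "low"/"high" into "first" (whichever of the two appears
 * first in the block) and remove "second". "high_start" is the position of
 * "high"'s data within the combined load, in bits. */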
static void
vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx,
                struct entry *low, struct entry *high,
                struct entry *first, struct entry *second,
                unsigned new_bit_size, unsigned new_num_components,
                unsigned high_start)
{
   unsigned low_bit_size = get_bit_size(low);
   unsigned high_bit_size = get_bit_size(high);
   bool low_bool = low->intrin->dest.ssa.bit_size == 1;
   bool high_bool = high->intrin->dest.ssa.bit_size == 1;
   nir_ssa_def *data = &first->intrin->dest.ssa;

   b->cursor = nir_after_instr(first->instr);

   /* update the load's destination size and extract data for each of the original loads */
   data->num_components = new_num_components;
   data->bit_size = new_bit_size;

   nir_ssa_def *low_def = nir_extract_bits(
      b, &data, 1, 0, low->intrin->num_components, low_bit_size);
   nir_ssa_def *high_def = nir_extract_bits(
      b, &data, 1, high_start, high->intrin->num_components, high_bit_size);

   /* convert booleans */
   low_def = low_bool ? nir_i2b(b, low_def) : nir_mov(b, low_def);
   high_def = high_bool ? nir_i2b(b, high_def) : nir_mov(b, high_def);

   /* update uses */
   if (first == low) {
      nir_ssa_def_rewrite_uses_after(&low->intrin->dest.ssa, low_def,
                                     high_def->parent_instr);
      nir_ssa_def_rewrite_uses(&high->intrin->dest.ssa, high_def);
   } else {
      nir_ssa_def_rewrite_uses(&low->intrin->dest.ssa, low_def);
      nir_ssa_def_rewrite_uses_after(&high->intrin->dest.ssa, high_def,
                                     high_def->parent_instr);
   }

   /* update the intrinsic */
   first->intrin->num_components = new_num_components;

   const struct intrinsic_info *info = first->info;

   /* update the offset */
   if (first != low && info->base_src >= 0) {
      /* let nir_opt_algebraic() remove this addition. this doesn't have many
       * issues with subtracting 16 from expressions like "(i + 1) * 16" because
       * nir_opt_algebraic() turns them into "i * 16 + 16" */
      b->cursor = nir_before_instr(first->instr);

      nir_ssa_def *new_base = first->intrin->src[info->base_src].ssa;
      new_base = nir_iadd_imm(b, new_base, -(int)(high_start / 8u));

      nir_instr_rewrite_src(first->instr, &first->intrin->src[info->base_src],
                            nir_src_for_ssa(new_base));
   }

   /* update the deref */
   if (info->deref_src >= 0) {
      b->cursor = nir_before_instr(first->instr);

      nir_deref_instr *deref = nir_src_as_deref(first->intrin->src[info->deref_src]);
      if (first != low && high_start != 0)
         deref = subtract_deref(b, deref, high_start / 8u);
      first->deref = cast_deref(b, new_num_components, new_bit_size, deref);

      nir_instr_rewrite_src(first->instr, &first->intrin->src[info->deref_src],
                            nir_src_for_ssa(&first->deref->dest.ssa));
   }

   /* update the range */
   if (nir_intrinsic_has_range_base(first->intrin)) {
      uint32_t low_base = nir_intrinsic_range_base(low->intrin);
      uint32_t high_base = nir_intrinsic_range_base(high->intrin);
      uint32_t low_end = low_base + nir_intrinsic_range(low->intrin);
      uint32_t high_end = high_base + nir_intrinsic_range(high->intrin);

      nir_intrinsic_set_range_base(first->intrin, low_base);
      nir_intrinsic_set_range(first->intrin, MAX2(low_end, high_end) - low_base);
   }

   first->key = low->key;
   first->offset = low->offset;

   first->align_mul = low->align_mul;
   first->align_offset = low->align_offset;

   nir_instr_remove(second->instr);
}

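/* Combine the stores "low"/"high" into "second" (whichever of the two appears
 * later in the block, so that both stored values are available) and remove
 * "first". */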
static void
vectorize_stores(nir_builder *b, struct vectorize_ctx *ctx,
                 struct entry *low, struct entry *high,
                 struct entry *first, struct entry *second,
                 unsigned new_bit_size, unsigned new_num_components,
                 unsigned high_start)
{
   ASSERTED unsigned low_size = low->intrin->num_components * get_bit_size(low);
   assert(low_size % new_bit_size == 0);

   b->cursor = nir_before_instr(second->instr);

   /* get new writemasks */
   uint32_t low_write_mask = nir_intrinsic_write_mask(low->intrin);
   uint32_t high_write_mask = nir_intrinsic_write_mask(high->intrin);
   low_write_mask = nir_component_mask_reinterpret(low_write_mask,
                                                   get_bit_size(low),
                                                   new_bit_size);
   high_write_mask = nir_component_mask_reinterpret(high_write_mask,
                                                    get_bit_size(high),
                                                    new_bit_size);
   high_write_mask <<= high_start / new_bit_size;

   uint32_t write_mask = low_write_mask | high_write_mask;

   /* convert booleans */
   nir_ssa_def *low_val = low->intrin->src[low->info->value_src].ssa;
   nir_ssa_def *high_val = high->intrin->src[high->info->value_src].ssa;
   low_val = low_val->bit_size == 1 ? nir_b2i(b, low_val, 32) : low_val;
   high_val = high_val->bit_size == 1 ? nir_b2i(b, high_val, 32) : high_val;

   /* combine the data */
   nir_ssa_def *data_channels[NIR_MAX_VEC_COMPONENTS];
   for (unsigned i = 0; i < new_num_components; i++) {
      bool set_low = low_write_mask & (1 << i);
      bool set_high = high_write_mask & (1 << i);

      if (set_low && (!set_high || low == second)) {
         unsigned offset = i * new_bit_size;
         data_channels[i] = nir_extract_bits(b, &low_val, 1, offset, 1, new_bit_size);
      } else if (set_high) {
         assert(!set_low || high == second);
         unsigned offset = i * new_bit_size - high_start;
         data_channels[i] = nir_extract_bits(b, &high_val, 1, offset, 1, new_bit_size);
      } else {
         data_channels[i] = nir_ssa_undef(b, 1, new_bit_size);
      }
   }
   nir_ssa_def *data = nir_vec(b, data_channels, new_num_components);

   /* update the intrinsic */
   nir_intrinsic_set_write_mask(second->intrin, write_mask);
   second->intrin->num_components = data->num_components;

   const struct intrinsic_info *info = second->info;
   assert(info->value_src >= 0);
   nir_instr_rewrite_src(second->instr, &second->intrin->src[info->value_src],
                         nir_src_for_ssa(data));

   /* update the offset */
   if (second != low && info->base_src >= 0)
      nir_instr_rewrite_src(second->instr, &second->intrin->src[info->base_src],
                            low->intrin->src[info->base_src]);

   /* update the deref */
   if (info->deref_src >= 0) {
      b->cursor = nir_before_instr(second->instr);
      second->deref = cast_deref(b, new_num_components, new_bit_size,
                                 nir_src_as_deref(low->intrin->src[info->deref_src]));
      nir_instr_rewrite_src(second->instr, &second->intrin->src[info->deref_src],
                            nir_src_for_ssa(&second->deref->dest.ssa));
   }

   /* update base/align */
   if (second != low && nir_intrinsic_has_base(second->intrin))
      nir_intrinsic_set_base(second->intrin, nir_intrinsic_base(low->intrin));

   second->key = low->key;
   second->offset = low->offset;

   second->align_mul = low->align_mul;
   second->align_offset = low->align_offset;

   list_del(&first->head);
   nir_instr_remove(first->instr);
}

/* Returns true if it can prove that "a" and "b" point to different bindings
 * and either one uses ACCESS_RESTRICT. */
static bool
bindings_different_restrict(nir_shader *shader, struct entry *a, struct entry *b)
{
   bool different_bindings = false;
   nir_variable *a_var = NULL, *b_var = NULL;
   if (a->key->resource && b->key->resource) {
      nir_binding a_res = nir_chase_binding(nir_src_for_ssa(a->key->resource));
      nir_binding b_res = nir_chase_binding(nir_src_for_ssa(b->key->resource));
      if (!a_res.success || !b_res.success)
         return false;

      if (a_res.num_indices != b_res.num_indices ||
          a_res.desc_set != b_res.desc_set ||
          a_res.binding != b_res.binding)
         different_bindings = true;

      for (unsigned i = 0; i < a_res.num_indices; i++) {
         if (nir_src_is_const(a_res.indices[i]) && nir_src_is_const(b_res.indices[i]) &&
             nir_src_as_uint(a_res.indices[i]) != nir_src_as_uint(b_res.indices[i]))
            different_bindings = true;
      }

      if (different_bindings) {
         a_var = nir_get_binding_variable(shader, a_res);
         b_var = nir_get_binding_variable(shader, b_res);
      }
   } else if (a->key->var && b->key->var) {
      a_var = a->key->var;
      b_var = b->key->var;
      different_bindings = a_var != b_var;
   } else {
      return false;
   }

   unsigned a_access = a->access | (a_var ? a_var->data.access : 0);
   unsigned b_access = b->access | (b_var ? b_var->data.access : 0);

   return different_bindings &&
          ((a_access | b_access) & ACCESS_RESTRICT);
}

static int64_t
compare_entries(struct entry *a, struct entry *b)
{
   if (!entry_key_equals(a->key, b->key))
      return INT64_MAX;
   return b->offset_signed - a->offset_signed;
}

static bool
may_alias(nir_shader *shader, struct entry *a, struct entry *b)
{
   assert(mode_to_index(get_variable_mode(a)) ==
          mode_to_index(get_variable_mode(b)));

   if ((a->access | b->access) & ACCESS_CAN_REORDER)
      return false;

   /* if the resources/variables are definitively different and at least one
    * has ACCESS_RESTRICT, we can assume they do not alias. */
   if (bindings_different_restrict(shader, a, b))
      return false;

   /* we can't compare offsets if the resources/variables might be different */
   if (a->key->var != b->key->var || a->key->resource != b->key->resource)
      return true;

   /* use adjacency information */
   /* TODO: we can look closer at the entry keys */
   int64_t diff = compare_entries(a, b);
   if (diff != INT64_MAX) {
      /* with atomics, intrin->num_components can be 0 */
      if (diff < 0)
         return llabs(diff) < MAX2(b->intrin->num_components, 1u) * (get_bit_size(b) / 8u);
      else
         return diff < MAX2(a->intrin->num_components, 1u) * (get_bit_size(a) / 8u);
   }

   /* TODO: we can use deref information */

   return true;
}

static bool
check_for_aliasing(struct vectorize_ctx *ctx, struct entry *first, struct entry *second)
{
   nir_variable_mode mode = get_variable_mode(first);
   if (mode & (nir_var_uniform | nir_var_system_value |
               nir_var_mem_push_const | nir_var_mem_ubo))
      return false;

   unsigned mode_index = mode_to_index(mode);
   if (first->is_store) {
      /* find first entry that aliases "first" */
      list_for_each_entry_from(struct entry, next, first, &ctx->entries[mode_index], head) {
         if (next == first)
            continue;
         if (next == second)
            return false;
         if (may_alias(ctx->shader, first, next))
            return true;
      }
   } else {
      /* find previous store that aliases this load */
      list_for_each_entry_from_rev(struct entry, prev, second, &ctx->entries[mode_index], head) {
         if (prev == second)
            continue;
         if (prev == first)
            return false;
         if (prev->is_store && may_alias(ctx->shader, second, prev))
            return true;
      }
   }

   return false;
}

static uint64_t
calc_gcd(uint64_t a, uint64_t b)
{
   while (b != 0) {
      uint64_t tmp_a = a;
      a = b;
      b = tmp_a % b;
   }
   return a;
}

static uint64_t
round_down(uint64_t a, uint64_t b)
{
   return a / b * b;
}

static bool
addition_wraps(uint64_t a, uint64_t b, unsigned bits)
{
   uint64_t mask = BITFIELD64_MASK(bits);
   return ((a + b) & mask) < (a & mask);
}

/* Return true if the addition of "low"'s offset and "high_offset" could wrap
 * around.
 *
 * This is to prevent a situation where the hardware considers the high load
 * out-of-bounds after vectorization if the low load is out-of-bounds, even if
 * the wrap-around from the addition could make the high load in-bounds.
 */
static bool
check_for_robustness(struct vectorize_ctx *ctx, struct entry *low, uint64_t high_offset)
{
   nir_variable_mode mode = get_variable_mode(low);
   if (!(mode & ctx->options->robust_modes))
      return false;

   /* First, try to use alignment information in case the application provided
    * some. If the addition of the maximum offset of the low load and
    * "high_offset" wraps around, we can't combine the low and high loads.
    */
   uint64_t max_low = round_down(UINT64_MAX, low->align_mul) + low->align_offset;
   if (!addition_wraps(max_low, high_offset, 64))
      return false;

   /* We can't obtain addition_bits */
   if (low->info->base_src < 0)
      return true;

   /* Second, use information about the factors from address calculation
    * (offset_defs_mul). These are not guaranteed to be power-of-two.
    */
   uint64_t stride = 0;
   for (unsigned i = 0; i < low->key->offset_def_count; i++)
      stride = calc_gcd(low->key->offset_defs_mul[i], stride);

   unsigned addition_bits = low->intrin->src[low->info->base_src].ssa->bit_size;
   /* low's offset must be a multiple of "stride" plus "low->offset". */
   max_low = low->offset;
   if (stride)
      max_low = round_down(BITFIELD64_MASK(addition_bits), stride) + (low->offset % stride);
   return addition_wraps(max_low, high_offset, addition_bits);
}

static bool
is_strided_vector(const struct glsl_type *type)
{
   if (glsl_type_is_vector(type)) {
      unsigned explicit_stride = glsl_get_explicit_stride(type);
      return explicit_stride != 0 && explicit_stride !=
             type_scalar_size_bytes(glsl_get_array_element(type));
   } else {
      return false;
   }
}

static bool
try_vectorize(nir_function_impl *impl, struct vectorize_ctx *ctx,
              struct entry *low, struct entry *high,
              struct entry *first, struct entry *second)
{
   if (!(get_variable_mode(first) & ctx->options->modes) ||
       !(get_variable_mode(second) & ctx->options->modes))
      return false;

   if (check_for_aliasing(ctx, first, second))
      return false;

   uint64_t diff = high->offset_signed - low->offset_signed;
   if (check_for_robustness(ctx, low, diff))
      return false;

   /* we can only vectorize non-volatile loads/stores of the same type and with
    * the same access */
   if (first->info != second->info || first->access != second->access ||
       (first->access & ACCESS_VOLATILE) || first->info->is_atomic)
      return false;

   /* don't attempt to vectorize accesses of row-major matrix columns */
   if (first->deref) {
      const struct glsl_type *first_type = first->deref->type;
      const struct glsl_type *second_type = second->deref->type;
      if (is_strided_vector(first_type) || is_strided_vector(second_type))
         return false;
   }

   /* gather information */
   unsigned low_bit_size = get_bit_size(low);
   unsigned high_bit_size = get_bit_size(high);
   unsigned low_size = low->intrin->num_components * low_bit_size;
   unsigned high_size = high->intrin->num_components * high_bit_size;
   unsigned new_size = MAX2(diff * 8u + high_size, low_size);

   /* find a good bit size for the new load/store */
   unsigned new_bit_size = 0;
   if (new_bitsize_acceptable(ctx, low_bit_size, low, high, new_size)) {
      new_bit_size = low_bit_size;
   } else if (low_bit_size != high_bit_size &&
              new_bitsize_acceptable(ctx, high_bit_size, low, high, new_size)) {
      new_bit_size = high_bit_size;
   } else {
      new_bit_size = 64;
      for (; new_bit_size >= 8; new_bit_size /= 2) {
         /* don't repeat trying out bitsizes */
         if (new_bit_size == low_bit_size || new_bit_size == high_bit_size)
            continue;
         if (new_bitsize_acceptable(ctx, new_bit_size, low, high, new_size))
            break;
      }
      if (new_bit_size < 8)
         return false;
   }
   unsigned new_num_components = new_size / new_bit_size;

   /* vectorize the loads/stores */
   nir_builder b;
   nir_builder_init(&b, impl);

   if (first->is_store)
      vectorize_stores(&b, ctx, low, high, first, second,
                       new_bit_size, new_num_components, diff * 8u);
   else
      vectorize_loads(&b, ctx, low, high, first, second,
                      new_bit_size, new_num_components, diff * 8u);

   return true;
}

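/* Write the alignment computed by calc_alignment() back to the intrinsic.
 * Returns true if it changed. */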
static bool
update_align(struct entry *entry)
{
   if (nir_intrinsic_has_align_mul(entry->intrin) &&
       (entry->align_mul != nir_intrinsic_align_mul(entry->intrin) ||
        entry->align_offset != nir_intrinsic_align_offset(entry->intrin))) {
      nir_intrinsic_set_align(entry->intrin, entry->align_mul, entry->align_offset);
      return true;
   }
   return false;
}

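/* Walk the offset-sorted entry array and try to combine each entry with the
 * overlapping or adjacent entries that follow it. Entries consumed by a
 * combine are replaced with NULL. */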
static bool
vectorize_sorted_entries(struct vectorize_ctx *ctx, nir_function_impl *impl,
                         struct util_dynarray *arr)
{
   unsigned num_entries = util_dynarray_num_elements(arr, struct entry *);

   bool progress = false;
   for (unsigned first_idx = 0; first_idx < num_entries; first_idx++) {
      struct entry *low = *util_dynarray_element(arr, struct entry *, first_idx);
      if (!low)
         continue;

      for (unsigned second_idx = first_idx + 1; second_idx < num_entries; second_idx++) {
         struct entry *high = *util_dynarray_element(arr, struct entry *, second_idx);
         if (!high)
            continue;

         uint64_t diff = high->offset_signed - low->offset_signed;
         if (diff > get_bit_size(low) / 8u * low->intrin->num_components)
            break;

         struct entry *first = low->index < high->index ? low : high;
         struct entry *second = low->index < high->index ? high : low;

         if (try_vectorize(impl, ctx, low, high, first, second)) {
            low = low->is_store ? second : first;
            *util_dynarray_element(arr, struct entry *, second_idx) = NULL;
            progress = true;
         }
      }

      *util_dynarray_element(arr, struct entry *, first_idx) = low;
   }

   return progress;
}

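/* Sort each adjacency list by offset, combine its entries until no more
 * progress is made, then update the alignment information of the survivors. */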
static bool
vectorize_entries(struct vectorize_ctx *ctx, nir_function_impl *impl, struct hash_table *ht)
{
   if (!ht)
      return false;

   bool progress = false;
   hash_table_foreach(ht, entry) {
      struct util_dynarray *arr = entry->data;
      if (!arr->size)
         continue;

      qsort(util_dynarray_begin(arr),
            util_dynarray_num_elements(arr, struct entry *),
            sizeof(struct entry *), &sort_entries);

      while (vectorize_sorted_entries(ctx, impl, arr))
         progress = true;

      util_dynarray_foreach(arr, struct entry *, elem) {
         if (*elem)
            progress |= update_align(*elem);
      }
   }

   _mesa_hash_table_clear(ht, delete_entry_dynarray);

   return progress;
}

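/* If "instr" acts as a memory barrier, vectorize and then forget the pending
 * entries for the affected modes and return true. */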
static bool
handle_barrier(struct vectorize_ctx *ctx, bool *progress, nir_function_impl *impl, nir_instr *instr)
{
   unsigned modes = 0;
   bool acquire = true;
   bool release = true;
   if (instr->type == nir_instr_type_intrinsic) {
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_group_memory_barrier:
      case nir_intrinsic_memory_barrier:
         modes = nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global;
         break;
      /* prevent speculative loads/stores */
      case nir_intrinsic_discard_if:
      case nir_intrinsic_discard:
      case nir_intrinsic_terminate_if:
      case nir_intrinsic_terminate:
         modes = nir_var_all;
         break;
      case nir_intrinsic_demote_if:
      case nir_intrinsic_demote:
         acquire = false;
         modes = nir_var_all;
         break;
      case nir_intrinsic_memory_barrier_buffer:
         modes = nir_var_mem_ssbo | nir_var_mem_global;
         break;
      case nir_intrinsic_memory_barrier_shared:
         modes = nir_var_mem_shared;
         break;
      case nir_intrinsic_scoped_barrier:
         if (nir_intrinsic_memory_scope(intrin) == NIR_SCOPE_NONE)
            break;

         modes = nir_intrinsic_memory_modes(intrin) & (nir_var_mem_ssbo |
                                                       nir_var_mem_shared |
                                                       nir_var_mem_global);
         acquire = nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_ACQUIRE;
         release = nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_RELEASE;
         switch (nir_intrinsic_memory_scope(intrin)) {
         case NIR_SCOPE_INVOCATION:
            /* a barrier should never be required for correctness with these scopes */
            modes = 0;
            break;
         default:
            break;
         }
         break;
      default:
         return false;
      }
   } else if (instr->type == nir_instr_type_call) {
      modes = nir_var_all;
   } else {
      return false;
   }

   while (modes) {
      unsigned mode_index = u_bit_scan(&modes);
      if ((1 << mode_index) == nir_var_mem_global) {
         /* Global should be rolled in with SSBO */
         assert(list_is_empty(&ctx->entries[mode_index]));
         assert(ctx->loads[mode_index] == NULL);
         assert(ctx->stores[mode_index] == NULL);
         continue;
      }

      if (acquire)
         *progress |= vectorize_entries(ctx, impl, ctx->loads[mode_index]);
      if (release)
         *progress |= vectorize_entries(ctx, impl, ctx->stores[mode_index]);
   }

   return true;
}

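/* Gather an entry for every supported load/store in "block", flushing at
 * barriers, then vectorize whatever is left at the end of the block. */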
static bool
process_block(nir_function_impl *impl, struct vectorize_ctx *ctx, nir_block *block)
{
   bool progress = false;

   for (unsigned i = 0; i < nir_num_variable_modes; i++) {
      list_inithead(&ctx->entries[i]);
      if (ctx->loads[i])
         _mesa_hash_table_clear(ctx->loads[i], delete_entry_dynarray);
      if (ctx->stores[i])
         _mesa_hash_table_clear(ctx->stores[i], delete_entry_dynarray);
   }

   /* create entries */
   unsigned next_index = 0;

   nir_foreach_instr_safe(instr, block) {
      if (handle_barrier(ctx, &progress, impl, instr))
         continue;

      /* gather information */
      if (instr->type != nir_instr_type_intrinsic)
         continue;
      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

      const struct intrinsic_info *info = get_info(intrin->intrinsic);
      if (!info)
         continue;

      nir_variable_mode mode = info->mode;
      if (!mode)
         mode = nir_src_as_deref(intrin->src[info->deref_src])->modes;
      if (!(mode & aliasing_modes(ctx->options->modes)))
         continue;
      unsigned mode_index = mode_to_index(mode);

      /* create entry */
      struct entry *entry = create_entry(ctx, info, intrin);
      entry->index = next_index++;

      list_addtail(&entry->head, &ctx->entries[mode_index]);

      /* add the entry to a hash table */

      struct hash_table *adj_ht = NULL;
      if (entry->is_store) {
         if (!ctx->stores[mode_index])
            ctx->stores[mode_index] = _mesa_hash_table_create(ctx, &hash_entry_key, &entry_key_equals);
         adj_ht = ctx->stores[mode_index];
      } else {
         if (!ctx->loads[mode_index])
            ctx->loads[mode_index] = _mesa_hash_table_create(ctx, &hash_entry_key, &entry_key_equals);
         adj_ht = ctx->loads[mode_index];
      }

      uint32_t key_hash = hash_entry_key(entry->key);
      struct hash_entry *adj_entry = _mesa_hash_table_search_pre_hashed(adj_ht, key_hash, entry->key);
      struct util_dynarray *arr;
      if (adj_entry && adj_entry->data) {
         arr = (struct util_dynarray *)adj_entry->data;
      } else {
         arr = ralloc(ctx, struct util_dynarray);
         util_dynarray_init(arr, arr);
         _mesa_hash_table_insert_pre_hashed(adj_ht, key_hash, entry->key, arr);
      }
      util_dynarray_append(arr, struct entry *, entry);
   }

   /* sort and combine entries */
   for (unsigned i = 0; i < nir_num_variable_modes; i++) {
      progress |= vectorize_entries(ctx, impl, ctx->loads[i]);
      progress |= vectorize_entries(ctx, impl, ctx->stores[i]);
   }

   return progress;
}

bool
nir_opt_load_store_vectorize(nir_shader *shader, const nir_load_store_vectorize_options *options)
{
   bool progress = false;

   struct vectorize_ctx *ctx = rzalloc(NULL, struct vectorize_ctx);
   ctx->shader = shader;
   ctx->options = options;

   nir_shader_index_vars(shader, options->modes);

   nir_foreach_function(function, shader) {
      if (function->impl) {
         if (options->modes & nir_var_function_temp)
            nir_function_impl_index_vars(function->impl);

         nir_foreach_block(block, function->impl)
            progress |= process_block(function->impl, ctx, block);

         nir_metadata_preserve(function->impl,
                               nir_metadata_block_index |
                               nir_metadata_dominance |
                               nir_metadata_live_ssa_defs);
      }
   }

   ralloc_free(ctx);
   return progress;
}