/*
 * Copyright (C) 2020 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"

/* NIR creates vectors as vecN ops, which we represent by a synthetic
 * BI_COMBINE instruction, e.g.:
 *
 *      v = combine x, y, z, w
 *
 * These combines need to be lowered by the pass in this file. Fix a given
 * source at component c.
 *
 * First suppose the source is SSA. If it is also scalar, then we may rewrite
 * the destination of the generating instruction (unique by SSA+scalar) to
 * write to v.c, and rewrite each of its uses to swizzle out .c instead of .x
 * (the original component, since the source is scalar). If it is a vector,
 * there are two cases. If the component c is `x`, so we are accessing v.x,
 * and each of the succeeding components y, z, ... up to the last component
 * of the vector is accessed sequentially, then we may perform the same
 * rewrite. If this is not the case, rewriting would require more complex
 * vector features, so we fall back on a move.
 *
 * Otherwise, if the source is not SSA, we also fall back on a move. We could
 * probably do better.
 */

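/* As an illustrative sketch (informal IR syntax): given a 32-bit
 *
 *      v = combine x, y, z, w
 *
 * the lowering below emits one move per component,
 *
 *      R.x = mov x
 *      R.y = mov y
 *      R.z = mov z
 *      R.w = mov w
 *
 * with R a fresh temporary register when v is SSA, after which uses of v are
 * rewritten to read R and the combine itself is removed. */
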
static void
bi_combine_mov32(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
{
        bi_instruction move = {
                .type = BI_MOV,
                .dest = R,
                .dest_type = nir_type_uint32,
                .dest_offset = comp,
                .src = { parent->src[comp] },
                .src_types = { nir_type_uint32 },
                .swizzle = { { parent->swizzle[comp][0] } }
        };

        bi_emit_before(ctx, parent, move);
}

static void
bi_combine_sel16(bi_context *ctx, bi_instruction *parent, unsigned comp, unsigned R)
{
        bi_instruction sel = {
                .type = BI_SELECT,
                .dest = R,
                .dest_type = nir_type_uint32,
                .dest_offset = comp >> 1,
                .src = { parent->src[comp], parent->src[comp + 1] },
                .src_types = { nir_type_uint16, nir_type_uint16 },
                .swizzle = {
                        { parent->swizzle[comp][0] },
                        { parent->swizzle[comp + 1][0] },
                }
        };

        /* In case we have a combine from a vec3 */
        if (!sel.src[1])
                sel.src[1] = BIR_INDEX_ZERO;

        bi_emit_before(ctx, parent, sel);
}

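/* Illustrative sketch of the 16-bit path: `v = combine x, y, z` lowers to
 * two SELECTs, each packing a pair of half-words into one 32-bit word of R:
 *
 *      R.word0 = select x, y   (x in bits 0-15, y in bits 16-31)
 *      R.word1 = select z, 0   (the missing fourth source padded with zero)
 */
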
/* Gets the instruction generating a given source. Combine lowering is
 * accidentally O(n^2) right now because this function is O(n) instead of O(1).
 * If this pass is slow, this cost can be avoided in favour of better
 * bookkeeping. */

#if 0
static bi_instruction *
bi_get_parent(bi_context *ctx, unsigned idx)
{
        bi_foreach_instr_global(ctx, ins) {
                if (ins->dest == idx)
                        return ins;
        }

        return NULL;
}
#endif

/* Rewrites uses of an index. Again, this could be O(n) in the size of the
 * program, but is currently O(nc) in the program size n and the combine
 * count c, so the pass becomes effectively O(n^2). Better bookkeeping would
 * bring it down to linear if that's an issue. */

static void
bi_rewrite_uses(bi_context *ctx,
                unsigned old, unsigned oldc,
                unsigned new, unsigned newc)
{
        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_src(ins, s) {
                        if (ins->src[s] != old) continue;

                        for (unsigned i = 0; i < 16; ++i)
                                ins->swizzle[s][i] += (newc - oldc);

                        ins->src[s] = new;
                }
        }
}

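/* For example (illustrative): rewriting uses of `old` starting at oldc = 0
 * to `new` starting at newc = 2, a use reading old.y (swizzle entry 1)
 * becomes a read of new.w (swizzle entry 3). */
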
/* Checks if we have a nicely aligned vector prefix */

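/* (Illustrative: if a 32-bit vector intrinsic writes t with three channels
 * and we see `v = combine t.x, t.y, t.z, w`, sources 0-2 form such an
 * aligned prefix of t, so the intrinsic could write v directly.) */
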
#if 0
static bool
bi_is_aligned_vec32(bi_instruction *combine, unsigned s, bi_instruction *io,
                unsigned *count)
{
        /* We only support prefixes */
        if (s != 0)
                return false;

        if (!(bi_class_props[io->type] & BI_VECTOR))
                return false;

        if (nir_alu_type_get_type_size(combine->dest_type) != 32)
                return false;

        if (nir_alu_type_get_type_size(io->dest_type) != 32)
                return false;

        unsigned components = io->vector_channels;

        /* Are we contiguous like that? */

        for (unsigned i = 0; i < components; ++i) {
                if (combine->src[i] != io->dest)
                        return false;

                if (combine->swizzle[i][0] != i)
                        return false;
        }

        /* We're good to go */
        *count = components;
        return true;
}

/* Tries to lower a given source of a combine to an appropriate rewrite,
 * returning true if successful, and false with no changes otherwise. */

static bool
bi_lower_combine_src(bi_context *ctx, bi_instruction *ins, unsigned s, unsigned R,
                unsigned *vec_count)
{
        unsigned src = ins->src[s];

        /* We currently only handle SSA */

        if (!src) return false;
        if (src & (BIR_SPECIAL | PAN_IS_REG)) return false;

        /* We are SSA. Lookup the generating instruction. */
        unsigned bytes = nir_alu_type_get_type_size(ins->dest_type) / 8;

        bi_instruction *parent = bi_get_parent(ctx, src);

        if (!parent) return false;

        /* We have a parent instruction, sanity check the typesize */
        unsigned pbytes = nir_alu_type_get_type_size(parent->dest_type) / 8;
        if (pbytes != bytes) return false;

        bool scalar = parent->vector_channels == 0;
        if (!(scalar || bi_is_aligned_vec32(ins, s, parent, vec_count))) return false;

        if (!bi_shift_mask(parent, bytes * s)) return false;
        bi_rewrite_uses(ctx, parent->dest, 0, R, s);
        parent->dest = R;
        return true;
}
#endif

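/* Lowers every BI_COMBINE in the block. SSA destinations get a fresh
 * temporary register R and their uses are rewritten to it afterwards;
 * register destinations are written in place. */
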
void
bi_lower_combine(bi_context *ctx, bi_block *block)
{
        bi_foreach_instr_in_block_safe(block, ins) {
                if (ins->type != BI_COMBINE) continue;

                bool needs_rewrite = !(ins->dest & PAN_IS_REG);
                unsigned R = needs_rewrite ? bi_make_temp_reg(ctx) : ins->dest;
                unsigned sz = nir_alu_type_get_type_size(ins->dest_type);

                bi_foreach_src(ins, s) {
                        /* We're done early for vec2/3 */
                        if (!ins->src[s])
                                continue;

#if 0
                        unsigned vec_count = 0;

                        if (bi_lower_combine_src(ctx, ins, s, R, &vec_count)) {
                                /* Skip vectored sources */
                                if (vec_count)
                                        s += (vec_count - 1);
                        } else {
                                bi_insert_combine_mov(ctx, ins, s, R);
                        }
#endif
                        if (sz == 32)
                                bi_combine_mov32(ctx, ins, s, R);
                        else if (sz == 16) {
                                /* Each SELECT consumes two 16-bit sources */
                                bi_combine_sel16(ctx, ins, s, R);
                                s++;
                        } else {
                                unreachable("Unknown COMBINE size");
                        }
                }

                if (needs_rewrite)
                        bi_rewrite_uses(ctx, ins->dest, 0, R, 0);

                bi_remove_instruction(ins);
        }
}