1 /*
2  * Copyright © 2011 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "util/u_math.h"
28 
29 namespace brw {
30 
vec4_instruction(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1,const src_reg & src2)31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32                                    const src_reg &src0, const src_reg &src1,
33                                    const src_reg &src2)
34 {
35    this->opcode = opcode;
36    this->dst = dst;
37    this->src[0] = src0;
38    this->src[1] = src1;
39    this->src[2] = src2;
40    this->saturate = false;
41    this->force_writemask_all = false;
42    this->no_dd_clear = false;
43    this->no_dd_check = false;
44    this->writes_accumulator = false;
45    this->conditional_mod = BRW_CONDITIONAL_NONE;
46    this->predicate = BRW_PREDICATE_NONE;
47    this->predicate_inverse = false;
48    this->target = 0;
49    this->shadow_compare = false;
50    this->eot = false;
51    this->ir = NULL;
52    this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53    this->header_size = 0;
54    this->flag_subreg = 0;
55    this->mlen = 0;
56    this->base_mrf = 0;
57    this->offset = 0;
58    this->exec_size = 8;
59    this->group = 0;
60    this->size_written = (dst.file == BAD_FILE ?
61                          0 : this->exec_size * type_sz(dst.type));
62    this->annotation = NULL;
63 }
64 
65 vec4_instruction *
emit(vec4_instruction * inst)66 vec4_visitor::emit(vec4_instruction *inst)
67 {
68    inst->ir = this->base_ir;
69    inst->annotation = this->current_annotation;
70 
71    this->instructions.push_tail(inst);
72 
73    return inst;
74 }
75 
76 vec4_instruction *
emit_before(bblock_t * block,vec4_instruction * inst,vec4_instruction * new_inst)77 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
78                           vec4_instruction *new_inst)
79 {
80    new_inst->ir = inst->ir;
81    new_inst->annotation = inst->annotation;
82 
83    inst->insert_before(block, new_inst);
84 
85    return inst;
86 }
87 
88 vec4_instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1,const src_reg & src2)89 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
90                    const src_reg &src1, const src_reg &src2)
91 {
92    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
93 }
94 
95 
96 vec4_instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1)97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
98                    const src_reg &src1)
99 {
100    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
101 }
102 
103 vec4_instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0)104 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
105 {
106    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
107 }
108 
109 vec4_instruction *
emit(enum opcode opcode,const dst_reg & dst)110 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
111 {
112    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
113 }
114 
115 vec4_instruction *
emit(enum opcode opcode)116 vec4_visitor::emit(enum opcode opcode)
117 {
118    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
119 }
120 
121 #define ALU1(op)							\
122    vec4_instruction *							\
123    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)		\
124    {									\
125       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
126    }
127 
128 #define ALU2(op)							\
129    vec4_instruction *							\
130    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
131                     const src_reg &src1)				\
132    {									\
133       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
134                                            src0, src1);                 \
135    }
136 
137 #define ALU2_ACC(op)							\
138    vec4_instruction *							\
139    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
140                     const src_reg &src1)				\
141    {									\
142       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
143                        BRW_OPCODE_##op, dst, src0, src1);		\
144       inst->writes_accumulator = true;                                  \
145       return inst;                                                      \
146    }
147 
148 #define ALU3(op)							\
149    vec4_instruction *							\
150    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
151                     const src_reg &src1, const src_reg &src2)		\
152    {									\
153       assert(devinfo->ver >= 6);						\
154       return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,	\
155 					   src0, src1, src2);		\
156    }
157 
158 ALU1(NOT)
ALU1(MOV)159 ALU1(MOV)
160 ALU1(FRC)
161 ALU1(RNDD)
162 ALU1(RNDE)
163 ALU1(RNDZ)
164 ALU1(F32TO16)
165 ALU1(F16TO32)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2_ACC(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(DP3)
173 ALU2(DP4)
174 ALU2(DPH)
175 ALU2(SHL)
176 ALU2(SHR)
177 ALU2(ASR)
178 ALU3(LRP)
179 ALU1(BFREV)
180 ALU3(BFE)
181 ALU2(BFI1)
182 ALU3(BFI2)
183 ALU1(FBH)
184 ALU1(FBL)
185 ALU1(CBIT)
186 ALU3(MAD)
187 ALU2_ACC(ADDC)
188 ALU2_ACC(SUBB)
189 ALU2(MAC)
190 ALU1(DIM)
191 
192 /** Gfx4 predicated IF. */
193 vec4_instruction *
194 vec4_visitor::IF(enum brw_predicate predicate)
195 {
196    vec4_instruction *inst;
197 
198    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
199    inst->predicate = predicate;
200 
201    return inst;
202 }
203 
204 /** Gfx6 IF with embedded comparison. */
205 vec4_instruction *
IF(src_reg src0,src_reg src1,enum brw_conditional_mod condition)206 vec4_visitor::IF(src_reg src0, src_reg src1,
207                  enum brw_conditional_mod condition)
208 {
209    assert(devinfo->ver == 6);
210 
211    vec4_instruction *inst;
212 
213    resolve_ud_negate(&src0);
214    resolve_ud_negate(&src1);
215 
216    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
217 					src0, src1);
218    inst->conditional_mod = condition;
219 
220    return inst;
221 }
222 
223 /**
224  * CMP: Sets the low bit of the destination channels with the result
225  * of the comparison, while the upper bits are undefined, and updates
226  * the flag register with the packed 16 bits of the result.
227  */
228 vec4_instruction *
CMP(dst_reg dst,src_reg src0,src_reg src1,enum brw_conditional_mod condition)229 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
230                   enum brw_conditional_mod condition)
231 {
232    vec4_instruction *inst;
233 
234    /* Take the instruction:
235     *
236     * CMP null<d> src0<f> src1<f>
237     *
238     * Original gfx4 does type conversion to the destination type before
239     * comparison, producing garbage results for floating point comparisons.
240     *
241     * The destination type doesn't matter on newer generations, so we set the
242     * type to match src0 so we can compact the instruction.
243     */
244    dst.type = src0.type;
245 
246    resolve_ud_negate(&src0);
247    resolve_ud_negate(&src1);
248 
249    inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
250    inst->conditional_mod = condition;
251 
252    return inst;
253 }
254 
255 vec4_instruction *
SCRATCH_READ(const dst_reg & dst,const src_reg & index)256 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
257 {
258    vec4_instruction *inst;
259 
260    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GFX4_SCRATCH_READ,
261 					dst, index);
262    inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver) + 1;
263    inst->mlen = 2;
264 
265    return inst;
266 }
267 
268 vec4_instruction *
SCRATCH_WRITE(const dst_reg & dst,const src_reg & src,const src_reg & index)269 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
270                             const src_reg &index)
271 {
272    vec4_instruction *inst;
273 
274    inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GFX4_SCRATCH_WRITE,
275 					dst, src, index);
276    inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver);
277    inst->mlen = 3;
278 
279    return inst;
280 }
281 
282 src_reg
fix_3src_operand(const src_reg & src)283 vec4_visitor::fix_3src_operand(const src_reg &src)
284 {
285    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
286     * able to use vertical stride of zero to replicate the vec4 uniform, like
287     *
288     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
289     *
290     * But you can't, since vertical stride is always four in three-source
291     * instructions. Instead, insert a MOV instruction to do the replication so
292     * that the three-source instruction can consume it.
293     */
294 
295    /* The MOV is only needed if the source is a uniform or immediate. */
296    if (src.file != UNIFORM && src.file != IMM)
297       return src;
298 
299    if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
300       return src;
301 
302    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
303    expanded.type = src.type;
304    emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
305    return src_reg(expanded);
306 }
307 
308 src_reg
fix_math_operand(const src_reg & src)309 vec4_visitor::fix_math_operand(const src_reg &src)
310 {
311    if (devinfo->ver < 6 || src.file == BAD_FILE)
312       return src;
313 
314    /* The gfx6 math instruction ignores the source modifiers --
315     * swizzle, abs, negate, and at least some parts of the register
316     * region description.
317     *
318     * Rather than trying to enumerate all these cases, *always* expand the
319     * operand to a temp GRF for gfx6.
320     *
321     * For gfx7, keep the operand as-is, except if immediate, which gfx7 still
322     * can't use.
323     */
324 
325    if (devinfo->ver == 7 && src.file != IMM)
326       return src;
327 
328    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
329    expanded.type = src.type;
330    emit(MOV(expanded, src));
331    return src_reg(expanded);
332 }
333 
334 vec4_instruction *
emit_math(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1)335 vec4_visitor::emit_math(enum opcode opcode,
336                         const dst_reg &dst,
337                         const src_reg &src0, const src_reg &src1)
338 {
339    vec4_instruction *math =
340       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
341 
342    if (devinfo->ver == 6 && dst.writemask != WRITEMASK_XYZW) {
343       /* MATH on Gfx6 must be align1, so we can't do writemasks. */
344       math->dst = dst_reg(this, glsl_type::vec4_type);
345       math->dst.type = dst.type;
346       math = emit(MOV(dst, src_reg(math->dst)));
347    } else if (devinfo->ver < 6) {
348       math->base_mrf = 1;
349       math->mlen = src1.file == BAD_FILE ? 1 : 2;
350    }
351 
352    return math;
353 }
354 
355 void
emit_pack_half_2x16(dst_reg dst,src_reg src0)356 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
357 {
358    if (devinfo->ver < 7) {
359       unreachable("ir_unop_pack_half_2x16 should be lowered");
360    }
361 
362    assert(dst.type == BRW_REGISTER_TYPE_UD);
363    assert(src0.type == BRW_REGISTER_TYPE_F);
364 
365    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
366     *
367     *   Because this instruction does not have a 16-bit floating-point type,
368     *   the destination data type must be Word (W).
369     *
370     *   The destination must be DWord-aligned and specify a horizontal stride
371     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
372     *   each destination channel and the upper word is not modified.
373     *
374     * The above restriction implies that the f32to16 instruction must use
375     * align1 mode, because only in align1 mode is it possible to specify
376     * horizontal stride.  We choose here to defy the hardware docs and emit
377     * align16 instructions.
378     *
379     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
380     * instructions. I was partially successful in that the code passed all
381     * tests.  However, the code was dubiously correct and fragile, and the
382     * tests were not harsh enough to probe that frailty. Not trusting the
383     * code, I chose instead to remain in align16 mode in defiance of the hw
384     * docs).
385     *
386     * I've [chadv] experimentally confirmed that, on gfx7 hardware and the
387     * simulator, emitting a f32to16 in align16 mode with UD as destination
388     * data type is safe. The behavior differs from that specified in the PRM
389     * in that the upper word of each destination channel is cleared to 0.
390     */
391 
392    dst_reg tmp_dst(this, glsl_type::uvec2_type);
393    src_reg tmp_src(tmp_dst);
394 
395 #if 0
396    /* Verify the undocumented behavior on which the following instructions
397     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
398     * then the result of the bit-or instruction below will be incorrect.
399     *
400     * You should inspect the disasm output in order to verify that the MOV is
401     * not optimized away.
402     */
403    emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
404 #endif
405 
406    /* Give tmp the form below, where "." means untouched.
407     *
408     *     w z          y          x w z          y          x
409     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
410     *
411     * That the upper word of each write-channel be 0 is required for the
412     * following bit-shift and bit-or instructions to work. Note that this
413     * relies on the undocumented hardware behavior mentioned above.
414     */
415    tmp_dst.writemask = WRITEMASK_XY;
416    emit(F32TO16(tmp_dst, src0));
417 
418    /* Give the write-channels of dst the form:
419     *   0xhhhh0000
420     */
421    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
422    emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
423 
424    /* Finally, give the write-channels of dst the form of packHalf2x16's
425     * output:
426     *   0xhhhhllll
427     */
428    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
429    emit(OR(dst, src_reg(dst), tmp_src));
430 }
431 
432 void
emit_unpack_half_2x16(dst_reg dst,src_reg src0)433 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
434 {
435    if (devinfo->ver < 7) {
436       unreachable("ir_unop_unpack_half_2x16 should be lowered");
437    }
438 
439    assert(dst.type == BRW_REGISTER_TYPE_F);
440    assert(src0.type == BRW_REGISTER_TYPE_UD);
441 
442    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
443     *
444     *   Because this instruction does not have a 16-bit floating-point type,
445     *   the source data type must be Word (W). The destination type must be
446     *   F (Float).
447     *
448     * To use W as the source data type, we must adjust horizontal strides,
449     * which is only possible in align1 mode. All my [chadv] attempts at
450     * emitting align1 instructions for unpackHalf2x16 failed to pass the
451     * Piglit tests, so I gave up.
452     *
453     * I've verified that, on gfx7 hardware and the simulator, it is safe to
454     * emit f16to32 in align16 mode with UD as source data type.
455     */
456 
457    dst_reg tmp_dst(this, glsl_type::uvec2_type);
458    src_reg tmp_src(tmp_dst);
459 
460    tmp_dst.writemask = WRITEMASK_X;
461    emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
462 
463    tmp_dst.writemask = WRITEMASK_Y;
464    emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
465 
466    dst.writemask = WRITEMASK_XY;
467    emit(F16TO32(dst, tmp_src));
468 }
469 
470 void
emit_unpack_unorm_4x8(const dst_reg & dst,src_reg src0)471 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
472 {
473    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
474     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
475     * is not suitable to generate the shift values, but we can use the packed
476     * vector float and a type-converting MOV.
477     */
478    dst_reg shift(this, glsl_type::uvec4_type);
479    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
480 
481    dst_reg shifted(this, glsl_type::uvec4_type);
482    src0.swizzle = BRW_SWIZZLE_XXXX;
483    emit(SHR(shifted, src0, src_reg(shift)));
484 
485    shifted.type = BRW_REGISTER_TYPE_UB;
486    dst_reg f(this, glsl_type::vec4_type);
487    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
488 
489    emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
490 }
491 
492 void
emit_unpack_snorm_4x8(const dst_reg & dst,src_reg src0)493 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
494 {
495    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
496     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
497     * is not suitable to generate the shift values, but we can use the packed
498     * vector float and a type-converting MOV.
499     */
500    dst_reg shift(this, glsl_type::uvec4_type);
501    emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
502 
503    dst_reg shifted(this, glsl_type::uvec4_type);
504    src0.swizzle = BRW_SWIZZLE_XXXX;
505    emit(SHR(shifted, src0, src_reg(shift)));
506 
507    shifted.type = BRW_REGISTER_TYPE_B;
508    dst_reg f(this, glsl_type::vec4_type);
509    emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
510 
511    dst_reg scaled(this, glsl_type::vec4_type);
512    emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
513 
514    dst_reg max(this, glsl_type::vec4_type);
515    emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
516    emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
517 }
518 
519 void
emit_pack_unorm_4x8(const dst_reg & dst,const src_reg & src0)520 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
521 {
522    dst_reg saturated(this, glsl_type::vec4_type);
523    vec4_instruction *inst = emit(MOV(saturated, src0));
524    inst->saturate = true;
525 
526    dst_reg scaled(this, glsl_type::vec4_type);
527    emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
528 
529    dst_reg rounded(this, glsl_type::vec4_type);
530    emit(RNDE(rounded, src_reg(scaled)));
531 
532    dst_reg u(this, glsl_type::uvec4_type);
533    emit(MOV(u, src_reg(rounded)));
534 
535    src_reg bytes(u);
536    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
537 }
538 
539 void
emit_pack_snorm_4x8(const dst_reg & dst,const src_reg & src0)540 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
541 {
542    dst_reg max(this, glsl_type::vec4_type);
543    emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
544 
545    dst_reg min(this, glsl_type::vec4_type);
546    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
547 
548    dst_reg scaled(this, glsl_type::vec4_type);
549    emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
550 
551    dst_reg rounded(this, glsl_type::vec4_type);
552    emit(RNDE(rounded, src_reg(scaled)));
553 
554    dst_reg i(this, glsl_type::ivec4_type);
555    emit(MOV(i, src_reg(rounded)));
556 
557    src_reg bytes(i);
558    emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
559 }
560 
561 /*
562  * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
563  * false) elements needed to pack a type.
564  */
565 static int
type_size_xvec4(const struct glsl_type * type,bool as_vec4,bool bindless)566 type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
567 {
568    unsigned int i;
569    int size;
570 
571    switch (type->base_type) {
572    case GLSL_TYPE_UINT:
573    case GLSL_TYPE_INT:
574    case GLSL_TYPE_FLOAT:
575    case GLSL_TYPE_FLOAT16:
576    case GLSL_TYPE_BOOL:
577    case GLSL_TYPE_DOUBLE:
578    case GLSL_TYPE_UINT16:
579    case GLSL_TYPE_INT16:
580    case GLSL_TYPE_UINT8:
581    case GLSL_TYPE_INT8:
582    case GLSL_TYPE_UINT64:
583    case GLSL_TYPE_INT64:
584       if (type->is_matrix()) {
585          const glsl_type *col_type = type->column_type();
586          unsigned col_slots =
587             (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
588          return type->matrix_columns * col_slots;
589       } else {
590          /* Regardless of size of vector, it gets a vec4. This is bad
591           * packing for things like floats, but otherwise arrays become a
592           * mess.  Hopefully a later pass over the code can pack scalars
593           * down if appropriate.
594           */
595          return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
596       }
597    case GLSL_TYPE_ARRAY:
598       assert(type->length > 0);
599       return type_size_xvec4(type->fields.array, as_vec4, bindless) *
600              type->length;
601    case GLSL_TYPE_STRUCT:
602    case GLSL_TYPE_INTERFACE:
603       size = 0;
604       for (i = 0; i < type->length; i++) {
605 	 size += type_size_xvec4(type->fields.structure[i].type, as_vec4,
606                                  bindless);
607       }
608       return size;
609    case GLSL_TYPE_SUBROUTINE:
610       return 1;
611 
612    case GLSL_TYPE_SAMPLER:
613       /* Samplers take up no register space, since they're baked in at
614        * link time.
615        */
616       return bindless ? 1 : 0;
617    case GLSL_TYPE_ATOMIC_UINT:
618       return 0;
619    case GLSL_TYPE_IMAGE:
620       return bindless ? 1 : DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
621    case GLSL_TYPE_VOID:
622    case GLSL_TYPE_ERROR:
623    case GLSL_TYPE_FUNCTION:
624       unreachable("not reached");
625    }
626 
627    return 0;
628 }
629 
630 /**
631  * Returns the minimum number of vec4 elements needed to pack a type.
632  *
633  * For simple types, it will return 1 (a single vec4); for matrices, the
634  * number of columns; for array and struct, the sum of the vec4_size of
635  * each of its elements; and for sampler and atomic, zero.
636  *
637  * This method is useful to calculate how much register space is needed to
638  * store a particular type.
639  */
640 extern "C" int
type_size_vec4(const struct glsl_type * type,bool bindless)641 type_size_vec4(const struct glsl_type *type, bool bindless)
642 {
643    return type_size_xvec4(type, true, bindless);
644 }
645 
646 /**
647  * Returns the minimum number of dvec4 elements needed to pack a type.
648  *
649  * For simple types, it will return 1 (a single dvec4); for matrices, the
650  * number of columns; for array and struct, the sum of the dvec4_size of
651  * each of its elements; and for sampler and atomic, zero.
652  *
653  * This method is useful to calculate how much register space is needed to
654  * store a particular type.
655  *
656  * Measuring double-precision vertex inputs as dvec4 is required because
657  * ARB_vertex_attrib_64bit states that these uses the same number of locations
658  * than the single-precision version. That is, two consecutives dvec4 would be
659  * located in location "x" and location "x+1", not "x+2".
660  *
661  * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
662  * remap_vs_attrs() will take in account both the location and also if the
663  * type fits in one or two vec4 slots.
664  */
665 extern "C" int
type_size_dvec4(const struct glsl_type * type,bool bindless)666 type_size_dvec4(const struct glsl_type *type, bool bindless)
667 {
668    return type_size_xvec4(type, false, bindless);
669 }
670 
src_reg(class vec4_visitor * v,const struct glsl_type * type)671 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
672 {
673    init();
674 
675    this->file = VGRF;
676    this->nr = v->alloc.allocate(type_size_vec4(type, false));
677 
678    if (type->is_array() || type->is_struct()) {
679       this->swizzle = BRW_SWIZZLE_NOOP;
680    } else {
681       this->swizzle = brw_swizzle_for_size(type->vector_elements);
682    }
683 
684    this->type = brw_type_for_base_type(type);
685 }
686 
src_reg(class vec4_visitor * v,const struct glsl_type * type,int size)687 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
688 {
689    assert(size > 0);
690 
691    init();
692 
693    this->file = VGRF;
694    this->nr = v->alloc.allocate(type_size_vec4(type, false) * size);
695 
696    this->swizzle = BRW_SWIZZLE_NOOP;
697 
698    this->type = brw_type_for_base_type(type);
699 }
700 
dst_reg(class vec4_visitor * v,const struct glsl_type * type)701 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
702 {
703    init();
704 
705    this->file = VGRF;
706    this->nr = v->alloc.allocate(type_size_vec4(type, false));
707 
708    if (type->is_array() || type->is_struct()) {
709       this->writemask = WRITEMASK_XYZW;
710    } else {
711       this->writemask = (1 << type->vector_elements) - 1;
712    }
713 
714    this->type = brw_type_for_base_type(type);
715 }
716 
717 vec4_instruction *
emit_minmax(enum brw_conditional_mod conditionalmod,dst_reg dst,src_reg src0,src_reg src1)718 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
719                           src_reg src0, src_reg src1)
720 {
721    vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
722    inst->conditional_mod = conditionalmod;
723    return inst;
724 }
725 
726 /**
727  * Emits the instructions needed to perform a pull constant load. before_block
728  * and before_inst can be NULL in which case the instruction will be appended
729  * to the end of the instruction list.
730  */
731 void
emit_pull_constant_load_reg(dst_reg dst,src_reg surf_index,src_reg offset_reg,bblock_t * before_block,vec4_instruction * before_inst)732 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
733                                           src_reg surf_index,
734                                           src_reg offset_reg,
735                                           bblock_t *before_block,
736                                           vec4_instruction *before_inst)
737 {
738    assert((before_inst == NULL && before_block == NULL) ||
739           (before_inst && before_block));
740 
741    vec4_instruction *pull;
742 
743    if (devinfo->ver >= 7) {
744       dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
745 
746       grf_offset.type = offset_reg.type;
747 
748       pull = MOV(grf_offset, offset_reg);
749 
750       if (before_inst)
751          emit_before(before_block, before_inst, pull);
752       else
753          emit(pull);
754 
755       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GFX7,
756                                            dst,
757                                            surf_index,
758                                            src_reg(grf_offset));
759       pull->mlen = 1;
760    } else {
761       pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
762                                            dst,
763                                            surf_index,
764                                            offset_reg);
765       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
766       pull->mlen = 1;
767    }
768 
769    if (before_inst)
770       emit_before(before_block, before_inst, pull);
771    else
772       emit(pull);
773 }
774 
775 src_reg
emit_uniformize(const src_reg & src)776 vec4_visitor::emit_uniformize(const src_reg &src)
777 {
778    const src_reg chan_index(this, glsl_type::uint_type);
779    const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
780                               src.type);
781 
782    emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
783       ->force_writemask_all = true;
784    emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
785       ->force_writemask_all = true;
786 
787    return src_reg(dst);
788 }
789 
790 src_reg
emit_mcs_fetch(const glsl_type * coordinate_type,src_reg coordinate,src_reg surface)791 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
792                              src_reg coordinate, src_reg surface)
793 {
794    vec4_instruction *inst =
795       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
796                                     dst_reg(this, glsl_type::uvec4_type));
797    inst->base_mrf = 2;
798    inst->src[1] = surface;
799    inst->src[2] = brw_imm_ud(0); /* sampler */
800    inst->mlen = 1;
801 
802    const int param_base = inst->base_mrf;
803 
804    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
805    int coord_mask = (1 << coordinate_type->vector_elements) - 1;
806    int zero_mask = 0xf & ~coord_mask;
807 
808    emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
809             coordinate));
810 
811    emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
812             brw_imm_d(0)));
813 
814    emit(inst);
815    return src_reg(inst->dst);
816 }
817 
818 bool
is_high_sampler(src_reg sampler)819 vec4_visitor::is_high_sampler(src_reg sampler)
820 {
821    if (!devinfo->is_haswell)
822       return false;
823 
824    return sampler.file != IMM || sampler.ud >= 16;
825 }
826 
827 void
emit_texture(ir_texture_opcode op,dst_reg dest,int dest_components,src_reg coordinate,int coord_components,src_reg shadow_comparator,src_reg lod,src_reg lod2,src_reg sample_index,uint32_t constant_offset,src_reg offset_value,src_reg mcs,uint32_t surface,src_reg surface_reg,src_reg sampler_reg)828 vec4_visitor::emit_texture(ir_texture_opcode op,
829                            dst_reg dest,
830                            int dest_components,
831                            src_reg coordinate,
832                            int coord_components,
833                            src_reg shadow_comparator,
834                            src_reg lod, src_reg lod2,
835                            src_reg sample_index,
836                            uint32_t constant_offset,
837                            src_reg offset_value,
838                            src_reg mcs,
839                            uint32_t surface,
840                            src_reg surface_reg,
841                            src_reg sampler_reg)
842 {
843    enum opcode opcode;
844    switch (op) {
845    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
846    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
847    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
848    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
849    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
850    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
851    case ir_tg4: opcode = offset_value.file != BAD_FILE
852                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
853    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
854    case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
855    case ir_txb:
856       unreachable("TXB is not valid for vertex shaders.");
857    case ir_lod:
858       unreachable("LOD is not valid for vertex shaders.");
859    case ir_samples_identical: {
860       /* There are some challenges implementing this for vec4, and it seems
861        * unlikely to be used anyway.  For now, just return false ways.
862        */
863       emit(MOV(dest, brw_imm_ud(0u)));
864       return;
865    }
866    default:
867       unreachable("Unrecognized tex op");
868    }
869 
870    vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
871 
872    inst->offset = constant_offset;
873 
874    /* The message header is necessary for:
875     * - Gfx4 (always)
876     * - Texel offsets
877     * - Gather channel selection
878     * - Sampler indices too large to fit in a 4-bit value.
879     * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
880     */
881    inst->header_size =
882       (devinfo->ver < 5 ||
883        inst->offset != 0 || op == ir_tg4 ||
884        op == ir_texture_samples ||
885        is_high_sampler(sampler_reg)) ? 1 : 0;
886    inst->base_mrf = 2;
887    inst->mlen = inst->header_size;
888    inst->dst.writemask = WRITEMASK_XYZW;
889    inst->shadow_compare = shadow_comparator.file != BAD_FILE;
890 
891    inst->src[1] = surface_reg;
892    inst->src[2] = sampler_reg;
893 
894    /* MRF for the first parameter */
895    int param_base = inst->base_mrf + inst->header_size;
896 
897    if (op == ir_txs || op == ir_query_levels) {
898       int writemask = devinfo->ver == 4 ? WRITEMASK_W : WRITEMASK_X;
899       emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
900       inst->mlen++;
901    } else if (op == ir_texture_samples) {
902       inst->dst.writemask = WRITEMASK_X;
903    } else {
904       /* Load the coordinate */
905       /* FINISHME: gl_clamp_mask and saturate */
906       int coord_mask = (1 << coord_components) - 1;
907       int zero_mask = 0xf & ~coord_mask;
908 
909       emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
910                coordinate));
911       inst->mlen++;
912 
913       if (zero_mask != 0) {
914          emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
915                   brw_imm_d(0)));
916       }
917       /* Load the shadow comparator */
918       if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
919 	 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
920 			  WRITEMASK_X),
921 		  shadow_comparator));
922 	 inst->mlen++;
923       }
924 
925       /* Load the LOD info */
926       if (op == ir_tex || op == ir_txl) {
927 	 int mrf, writemask;
928 	 if (devinfo->ver >= 5) {
929 	    mrf = param_base + 1;
930 	    if (shadow_comparator.file != BAD_FILE) {
931 	       writemask = WRITEMASK_Y;
932 	       /* mlen already incremented */
933 	    } else {
934 	       writemask = WRITEMASK_X;
935 	       inst->mlen++;
936 	    }
937 	 } else /* devinfo->ver == 4 */ {
938 	    mrf = param_base;
939 	    writemask = WRITEMASK_W;
940 	 }
941 	 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
942       } else if (op == ir_txf) {
943          emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
944       } else if (op == ir_txf_ms) {
945          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
946                   sample_index));
947          if (devinfo->ver >= 7) {
948             /* MCS data is in the first channel of `mcs`, but we need to get it into
949              * the .y channel of the second vec4 of params, so replicate .x across
950              * the whole vec4 and then mask off everything except .y
951              */
952             mcs.swizzle = BRW_SWIZZLE_XXXX;
953             emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
954                      mcs));
955          }
956          inst->mlen++;
957       } else if (op == ir_txd) {
958          const brw_reg_type type = lod.type;
959 
960 	 if (devinfo->ver >= 5) {
961 	    lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
962 	    lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
963 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
964 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
965 	    inst->mlen++;
966 
967 	    if (dest_components == 3 || shadow_comparator.file != BAD_FILE) {
968 	       lod.swizzle = BRW_SWIZZLE_ZZZZ;
969 	       lod2.swizzle = BRW_SWIZZLE_ZZZZ;
970 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
971 	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
972 	       inst->mlen++;
973 
974                if (shadow_comparator.file != BAD_FILE) {
975                   emit(MOV(dst_reg(MRF, param_base + 2,
976                                    shadow_comparator.type, WRITEMASK_Z),
977                            shadow_comparator));
978                }
979 	    }
980 	 } else /* devinfo->ver == 4 */ {
981 	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
982 	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
983 	    inst->mlen += 2;
984 	 }
985       } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
986          if (shadow_comparator.file != BAD_FILE) {
987             emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
988                      shadow_comparator));
989          }
990 
991          emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
992                   offset_value));
993          inst->mlen++;
994       }
995    }
996 
997    emit(inst);
998 
999    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1000     * spec requires layers.
1001     */
1002    if (op == ir_txs && devinfo->ver < 7) {
1003       /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
1004       emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1005                   src_reg(inst->dst), brw_imm_d(1));
1006    }
1007 
1008    if (devinfo->ver == 6 && op == ir_tg4) {
1009       emit_gfx6_gather_wa(key_tex->gfx6_gather_wa[surface], inst->dst);
1010    }
1011 
1012    if (op == ir_query_levels) {
1013       /* # levels is in .w */
1014       src_reg swizzled(dest);
1015       swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1016                                       SWIZZLE_W, SWIZZLE_W);
1017       emit(MOV(dest, swizzled));
1018    }
1019 }
1020 
1021 /**
1022  * Apply workarounds for Gfx6 gather with UINT/SINT
1023  */
1024 void
emit_gfx6_gather_wa(uint8_t wa,dst_reg dst)1025 vec4_visitor::emit_gfx6_gather_wa(uint8_t wa, dst_reg dst)
1026 {
1027    if (!wa)
1028       return;
1029 
1030    int width = (wa & WA_8BIT) ? 8 : 16;
1031    dst_reg dst_f = dst;
1032    dst_f.type = BRW_REGISTER_TYPE_F;
1033 
1034    /* Convert from UNORM to UINT */
1035    emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1036    emit(MOV(dst, src_reg(dst_f)));
1037 
1038    if (wa & WA_SIGN) {
1039       /* Reinterpret the UINT value as a signed INT value by
1040        * shifting the sign bit into place, then shifting back
1041        * preserving sign.
1042        */
1043       emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1044       emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1045    }
1046 }
1047 
1048 void
gs_emit_vertex(int)1049 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1050 {
1051    unreachable("not reached");
1052 }
1053 
1054 void
gs_end_primitive()1055 vec4_visitor::gs_end_primitive()
1056 {
1057    unreachable("not reached");
1058 }
1059 
1060 void
emit_ndc_computation()1061 vec4_visitor::emit_ndc_computation()
1062 {
1063    if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1064       return;
1065 
1066    /* Get the position */
1067    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1068 
1069    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1070    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1071    output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1072    output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1073 
1074    current_annotation = "NDC";
1075    dst_reg ndc_w = ndc;
1076    ndc_w.writemask = WRITEMASK_W;
1077    src_reg pos_w = pos;
1078    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1079    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1080 
1081    dst_reg ndc_xyz = ndc;
1082    ndc_xyz.writemask = WRITEMASK_XYZ;
1083 
1084    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1085 }
1086 
1087 void
emit_psiz_and_flags(dst_reg reg)1088 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1089 {
1090    if (devinfo->ver < 6 &&
1091        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1092         output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1093         devinfo->has_negative_rhw_bug)) {
1094       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1095       dst_reg header1_w = header1;
1096       header1_w.writemask = WRITEMASK_W;
1097 
1098       emit(MOV(header1, brw_imm_ud(0u)));
1099 
1100       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1101 	 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1102 
1103 	 current_annotation = "Point size";
1104 	 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1105 	 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1106       }
1107 
1108       if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1109          current_annotation = "Clipping flags";
1110          dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1111 
1112          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1113          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1114          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1115       }
1116 
1117       if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) {
1118          dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1119          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1120          emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1121          emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1122          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1123       }
1124 
1125       /* i965 clipping workaround:
1126        * 1) Test for -ve rhw
1127        * 2) If set,
1128        *      set ndc = (0,0,0,0)
1129        *      set ucp[6] = 1
1130        *
1131        * Later, clipping will detect ucp[6] and ensure the primitive is
1132        * clipped against all fixed planes.
1133        */
1134       if (devinfo->has_negative_rhw_bug &&
1135           output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1136          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1137          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1138          emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1139          vec4_instruction *inst;
1140          inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1141          inst->predicate = BRW_PREDICATE_NORMAL;
1142          output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1143          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1144          inst->predicate = BRW_PREDICATE_NORMAL;
1145       }
1146 
1147       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1148    } else if (devinfo->ver < 6) {
1149       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1150    } else {
1151       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1152       if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
1153          dst_reg reg_w = reg;
1154          reg_w.writemask = WRITEMASK_W;
1155          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1156          reg_as_src.type = reg_w.type;
1157          reg_as_src.swizzle = brw_swizzle_for_size(1);
1158          emit(MOV(reg_w, reg_as_src));
1159       }
1160       if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
1161          dst_reg reg_y = reg;
1162          reg_y.writemask = WRITEMASK_Y;
1163          reg_y.type = BRW_REGISTER_TYPE_D;
1164          output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1165          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1166       }
1167       if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
1168          dst_reg reg_z = reg;
1169          reg_z.writemask = WRITEMASK_Z;
1170          reg_z.type = BRW_REGISTER_TYPE_D;
1171          output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1172          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1173       }
1174    }
1175 }
1176 
1177 vec4_instruction *
emit_generic_urb_slot(dst_reg reg,int varying,int component)1178 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1179 {
1180    assert(varying < VARYING_SLOT_MAX);
1181 
1182    unsigned num_comps = output_num_components[varying][component];
1183    if (num_comps == 0)
1184       return NULL;
1185 
1186    assert(output_reg[varying][component].type == reg.type);
1187    current_annotation = output_reg_annotation[varying];
1188    if (output_reg[varying][component].file != BAD_FILE) {
1189       src_reg src = src_reg(output_reg[varying][component]);
1190       src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1191       reg.writemask =
1192          brw_writemask_for_component_packing(num_comps, component);
1193       return emit(MOV(reg, src));
1194    }
1195    return NULL;
1196 }
1197 
1198 void
emit_urb_slot(dst_reg reg,int varying)1199 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1200 {
1201    reg.type = BRW_REGISTER_TYPE_F;
1202    output_reg[varying][0].type = reg.type;
1203 
1204    switch (varying) {
1205    case VARYING_SLOT_PSIZ:
1206    {
1207       /* PSIZ is always in slot 0, and is coupled with other flags. */
1208       current_annotation = "indices, point width, clip flags";
1209       emit_psiz_and_flags(reg);
1210       break;
1211    }
1212    case BRW_VARYING_SLOT_NDC:
1213       current_annotation = "NDC";
1214       if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1215          emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1216       break;
1217    case VARYING_SLOT_POS:
1218       current_annotation = "gl_Position";
1219       if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1220          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1221       break;
1222    case BRW_VARYING_SLOT_PAD:
1223       /* No need to write to this slot */
1224       break;
1225    default:
1226       for (int i = 0; i < 4; i++) {
1227          emit_generic_urb_slot(reg, varying, i);
1228       }
1229       break;
1230    }
1231 }
1232 
1233 static unsigned
align_interleaved_urb_mlen(const struct intel_device_info * devinfo,unsigned mlen)1234 align_interleaved_urb_mlen(const struct intel_device_info *devinfo,
1235                            unsigned mlen)
1236 {
1237    if (devinfo->ver >= 6) {
1238       /* URB data written (does not include the message header reg) must
1239        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1240        * section 5.4.3.2.2: URB_INTERLEAVED.
1241        *
1242        * URB entries are allocated on a multiple of 1024 bits, so an
1243        * extra 128 bits written here to make the end align to 256 is
1244        * no problem.
1245        */
1246       if ((mlen % 2) != 1)
1247 	 mlen++;
1248    }
1249 
1250    return mlen;
1251 }
1252 
1253 
1254 /**
1255  * Generates the VUE payload plus the necessary URB write instructions to
1256  * output it.
1257  *
1258  * The VUE layout is documented in Volume 2a.
1259  */
1260 void
emit_vertex()1261 vec4_visitor::emit_vertex()
1262 {
1263    /* MRF 0 is reserved for the debugger, so start with message header
1264     * in MRF 1.
1265     */
1266    int base_mrf = 1;
1267    int mrf = base_mrf;
1268    /* In the process of generating our URB write message contents, we
1269     * may need to unspill a register or load from an array.  Those
1270     * reads would use MRFs 14-15.
1271     */
1272    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver);
1273 
1274    /* The following assertion verifies that max_usable_mrf causes an
1275     * even-numbered amount of URB write data, which will meet gfx6's
1276     * requirements for length alignment.
1277     */
1278    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1279 
1280    /* First mrf is the g0-based message header containing URB handles and
1281     * such.
1282     */
1283    emit_urb_write_header(mrf++);
1284 
1285    if (devinfo->ver < 6) {
1286       emit_ndc_computation();
1287    }
1288 
1289    /* We may need to split this up into several URB writes, so do them in a
1290     * loop.
1291     */
1292    int slot = 0;
1293    bool complete = false;
1294    do {
1295       /* URB offset is in URB row increments, and each of our MRFs is half of
1296        * one of those, since we're doing interleaved writes.
1297        */
1298       int offset = slot / 2;
1299 
1300       mrf = base_mrf + 1;
1301       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1302          emit_urb_slot(dst_reg(MRF, mrf++),
1303                        prog_data->vue_map.slot_to_varying[slot]);
1304 
1305          /* If this was max_usable_mrf, we can't fit anything more into this
1306           * URB WRITE. Same thing if we reached the maximum length available.
1307           */
1308          if (mrf > max_usable_mrf ||
1309              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1310             slot++;
1311             break;
1312          }
1313       }
1314 
1315       complete = slot >= prog_data->vue_map.num_slots;
1316       current_annotation = "URB write";
1317       vec4_instruction *inst = emit_urb_write_opcode(complete);
1318       inst->base_mrf = base_mrf;
1319       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1320       inst->offset += offset;
1321    } while(!complete);
1322 }
1323 
1324 
1325 src_reg
get_scratch_offset(bblock_t * block,vec4_instruction * inst,src_reg * reladdr,int reg_offset)1326 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1327 				 src_reg *reladdr, int reg_offset)
1328 {
1329    /* Because we store the values to scratch interleaved like our
1330     * vertex data, we need to scale the vec4 index by 2.
1331     */
1332    int message_header_scale = 2;
1333 
1334    /* Pre-gfx6, the message header uses byte offsets instead of vec4
1335     * (16-byte) offset units.
1336     */
1337    if (devinfo->ver < 6)
1338       message_header_scale *= 16;
1339 
1340    if (reladdr) {
1341       /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1342        * to multiply the reladdr by 2. Notice that the reg_offset part
1343        * is in units of 16 bytes and is used to select the low/high 16-byte
1344        * chunk of a full dvec4, so we don't want to multiply that part.
1345        */
1346       src_reg index = src_reg(this, glsl_type::int_type);
1347       if (type_sz(inst->dst.type) < 8) {
1348          emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1349                                       brw_imm_d(reg_offset)));
1350          emit_before(block, inst, MUL(dst_reg(index), index,
1351                                       brw_imm_d(message_header_scale)));
1352       } else {
1353          emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1354                                       brw_imm_d(message_header_scale * 2)));
1355          emit_before(block, inst, ADD(dst_reg(index), index,
1356                                       brw_imm_d(reg_offset * message_header_scale)));
1357       }
1358       return index;
1359    } else {
1360       return brw_imm_d(reg_offset * message_header_scale);
1361    }
1362 }
1363 
1364 /**
1365  * Emits an instruction before @inst to load the value named by @orig_src
1366  * from scratch space at @base_offset to @temp.
1367  *
1368  * @base_offset is measured in 32-byte units (the size of a register).
1369  */
1370 void
emit_scratch_read(bblock_t * block,vec4_instruction * inst,dst_reg temp,src_reg orig_src,int base_offset)1371 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1372 				dst_reg temp, src_reg orig_src,
1373 				int base_offset)
1374 {
1375    assert(orig_src.offset % REG_SIZE == 0);
1376    int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1377    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1378                                       reg_offset);
1379 
1380    if (type_sz(orig_src.type) < 8) {
1381       emit_before(block, inst, SCRATCH_READ(temp, index));
1382    } else {
1383       dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1384       dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1385       emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1386       index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1387       vec4_instruction *last_read =
1388          SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1389       emit_before(block, inst, last_read);
1390       shuffle_64bit_data(temp, src_reg(shuffled), false, true, block, last_read);
1391    }
1392 }
1393 
1394 /**
1395  * Emits an instruction after @inst to store the value to be written
1396  * to @orig_dst to scratch space at @base_offset, from @temp.
1397  *
1398  * @base_offset is measured in 32-byte units (the size of a register).
1399  */
1400 void
emit_scratch_write(bblock_t * block,vec4_instruction * inst,int base_offset)1401 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1402                                  int base_offset)
1403 {
1404    assert(inst->dst.offset % REG_SIZE == 0);
1405    int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1406    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1407                                       reg_offset);
1408 
1409    /* Create a temporary register to store *inst's result in.
1410     *
1411     * We have to be careful in MOVing from our temporary result register in
1412     * the scratch write.  If we swizzle from channels of the temporary that
1413     * weren't initialized, it will confuse live interval analysis, which will
1414     * make spilling fail to make progress.
1415     */
1416    bool is_64bit = type_sz(inst->dst.type) == 8;
1417    const glsl_type *alloc_type =
1418       is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1419    const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1420                                        inst->dst.type),
1421                                 brw_swizzle_for_mask(inst->dst.writemask));
1422 
1423    if (!is_64bit) {
1424       dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1425 				          inst->dst.writemask));
1426       vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1427       if (inst->opcode != BRW_OPCODE_SEL)
1428          write->predicate = inst->predicate;
1429       write->ir = inst->ir;
1430       write->annotation = inst->annotation;
1431       inst->insert_after(block, write);
1432    } else {
1433       dst_reg shuffled = dst_reg(this, alloc_type);
1434       vec4_instruction *last =
1435          shuffle_64bit_data(shuffled, temp, true, true, block, inst);
1436       src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1437 
1438       uint8_t mask = 0;
1439       if (inst->dst.writemask & WRITEMASK_X)
1440          mask |= WRITEMASK_XY;
1441       if (inst->dst.writemask & WRITEMASK_Y)
1442          mask |= WRITEMASK_ZW;
1443       if (mask) {
1444          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1445 
1446          vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1447          if (inst->opcode != BRW_OPCODE_SEL)
1448             write->predicate = inst->predicate;
1449          write->ir = inst->ir;
1450          write->annotation = inst->annotation;
1451          last->insert_after(block, write);
1452       }
1453 
1454       mask = 0;
1455       if (inst->dst.writemask & WRITEMASK_Z)
1456          mask |= WRITEMASK_XY;
1457       if (inst->dst.writemask & WRITEMASK_W)
1458          mask |= WRITEMASK_ZW;
1459       if (mask) {
1460          dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1461 
1462          src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1463                                             reg_offset + 1);
1464          vec4_instruction *write =
1465             SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1466          if (inst->opcode != BRW_OPCODE_SEL)
1467             write->predicate = inst->predicate;
1468          write->ir = inst->ir;
1469          write->annotation = inst->annotation;
1470          last->insert_after(block, write);
1471       }
1472    }
1473 
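   /* Finally, redirect @inst to write the temporary; the scratch write(s)
    * emitted above store that temporary to the spilled location.
    */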
1474    inst->dst.file = temp.file;
1475    inst->dst.nr = temp.nr;
1476    inst->dst.offset %= REG_SIZE;
1477    inst->dst.reladdr = NULL;
1478 }
1479 
1480 /**
1481  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1482  * adds the scratch read(s) before \p inst. The function also checks for
1483  * recursive reladdr scratch accesses, issuing the corresponding scratch
1484  * loads and rewriting reladdr references accordingly.
1485  *
1486  * \return \p src if it did not require a scratch load, otherwise, the
1487  * register holding the result of the scratch load that the caller should
1488  * use to rewrite src.
1489  */
1490 src_reg
1491 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1492                                    vec4_instruction *inst, src_reg src)
1493 {
1494    /* Resolve recursive reladdr scratch access by calling ourselves
1495     * with src.reladdr
1496     */
1497    if (src.reladdr)
1498       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1499                                           *src.reladdr);
1500 
1501    /* Now handle scratch access on src */
1502    if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1503       dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1504          glsl_type::dvec4_type : glsl_type::vec4_type);
1505       emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1506       src.nr = temp.nr;
1507       src.offset %= REG_SIZE;
1508       src.reladdr = NULL;
1509    }
1510 
1511    return src;
1512 }
1513 
1514 /**
1515  * We can't generally support array access in GRF space, because a
1516  * single instruction's destination can only span 2 contiguous
1517  * registers.  So, we send all GRF arrays that get variable index
1518  * access to scratch space.
1519  */
1520 void
1521 vec4_visitor::move_grf_array_access_to_scratch()
1522 {
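   /* scratch_loc[i] will hold the scratch offset (in register-sized units)
    * assigned to virtual GRF i, or -1 if it is not moved to scratch.
    */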
1523    int scratch_loc[this->alloc.count];
1524    memset(scratch_loc, -1, sizeof(scratch_loc));
1525 
1526    /* First, calculate the set of virtual GRFs that need to be punted
1527     * to scratch due to having any array access on them, and where in
1528     * scratch.
1529     */
1530    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1531       if (inst->dst.file == VGRF && inst->dst.reladdr) {
1532          if (scratch_loc[inst->dst.nr] == -1) {
1533             scratch_loc[inst->dst.nr] = last_scratch;
1534             last_scratch += this->alloc.sizes[inst->dst.nr];
1535          }
1536 
1537          for (src_reg *iter = inst->dst.reladdr;
1538               iter->reladdr;
1539               iter = iter->reladdr) {
1540             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1541                scratch_loc[iter->nr] = last_scratch;
1542                last_scratch += this->alloc.sizes[iter->nr];
1543             }
1544          }
1545       }
1546 
1547       for (int i = 0; i < 3; i++) {
1548          for (src_reg *iter = &inst->src[i];
1549               iter->reladdr;
1550               iter = iter->reladdr) {
1551             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1552                scratch_loc[iter->nr] = last_scratch;
1553                last_scratch += this->alloc.sizes[iter->nr];
1554             }
1555          }
1556       }
1557    }
1558 
1559    /* Now, for anything that will be accessed through scratch, rewrite
1560     * it to load/store.  Note that this is a _safe list walk, because
1561     * we may generate a new scratch_write instruction after the one
1562     * we're processing.
1563     */
1564    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1565       /* Set up the annotation tracking for newly generated instructions. */
1566       base_ir = inst->ir;
1567       current_annotation = inst->annotation;
1568 
1569       /* First handle scratch access on the dst. Notice we have to handle
1570        * the case where the dst's reladdr also points to scratch space.
1571        */
1572       if (inst->dst.reladdr)
1573          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1574                                                    *inst->dst.reladdr);
1575 
1576       /* Now that we have handled any (possibly recursive) reladdr scratch
1577        * accesses for dst we can safely do the scratch write for dst itself
1578        */
1579       if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1580          emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1581 
1582       /* Now handle scratch access on any src. In this case, since inst->src[i]
1583        * already is a src_reg, we can just call emit_resolve_reladdr with
1584        * inst->src[i] and it will take care of handling scratch loads for
1585        * both src and src.reladdr (recursively).
1586        */
1587       for (int i = 0; i < 3; i++) {
1588          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1589                                              inst->src[i]);
1590       }
1591    }
1592 }
1593 
1594 /**
1595  * Emits an instruction before @inst to load the value named by @orig_src
1596  * from the pull constant buffer (surface) at @base_offset to @temp.
1597  */
1598 void
1599 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1600                                       dst_reg temp, src_reg orig_src,
1601                                       int base_offset, src_reg indirect)
1602 {
1603    assert(orig_src.offset % 16 == 0);
1604    const unsigned index = prog_data->base.binding_table.pull_constants_start;
1605 
1606    /* For 64-bit loads we need to emit two 32-bit load messages, and we
1607     * also need to shuffle the 32-bit data result into proper 64-bit data.
1608     * To do that we emit the 32-bit loads into a temporary and shuffle the
1609     * result into the original destination.
1610     */
1611    dst_reg orig_temp = temp;
1612    bool is_64bit = type_sz(orig_src.type) == 8;
1613    if (is_64bit) {
1614       assert(type_sz(temp.type) == 8);
1615       dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1616       temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1617    }
1618 
1619    src_reg src = orig_src;
1620    for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1621       int reg_offset = base_offset + src.offset / 16;
1622 
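      /* With a dynamic index the message offset is computed at run time by
       * adding the constant base to the indirect value; otherwise a plain
       * immediate offset is enough.
       */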
1623       src_reg offset;
1624       if (indirect.file != BAD_FILE) {
1625          offset = src_reg(this, glsl_type::uint_type);
1626          emit_before(block, inst, ADD(dst_reg(offset), indirect,
1627                                       brw_imm_ud(reg_offset * 16)));
1628       } else {
1629          offset = brw_imm_d(reg_offset * 16);
1630       }
1631 
1632       emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1633                                   brw_imm_ud(index),
1634                                   offset,
1635                                   block, inst);
1636 
1637       src = byte_offset(src, 16);
1638    }
1639 
1640    if (is_64bit) {
1641       temp = retype(temp, BRW_REGISTER_TYPE_DF);
1642       shuffle_64bit_data(orig_temp, src_reg(temp), false, false, block, inst);
1643    }
1644 }
1645 
1646 /**
1647  * Implements array access of uniforms by inserting a
1648  * PULL_CONSTANT_LOAD instruction.
1649  *
1650  * Unlike temporary GRF array access, which we don't support due to the
1651  * difficulty of doing relative addressing on instruction destinations,
1652  * we could potentially do array access of uniforms that were loaded in
1653  * GRF space as push constants.  In the real-world usage we've seen,
1654  * though, the arrays being accessed are always larger than we could
1655  * load as push constants, so we just always move all uniform array
1656  * access out to a pull constant buffer.
1657  */
1658 void
1659 vec4_visitor::move_uniform_array_access_to_pull_constants()
1660 {
1661    /* The Vulkan driver doesn't support pull constants other than UBOs, so
1662     * everything has to be pushed regardless.
1663     */
1664    if (!compiler->supports_pull_constants) {
1665       split_uniform_registers();
1666       return;
1667    }
1668 
1669    /* Allocate the pull_params array */
1670    assert(stage_prog_data->nr_pull_params == 0);
1671    stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
1672                                               this->uniforms * 4);
1673 
1674    int pull_constant_loc[this->uniforms];
1675    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1676 
1677    /* First, walk through the instructions and determine which things need to
1678     * be pulled.  We mark something as needing to be pulled by setting
1679     * pull_constant_loc to 0.
1680     */
1681    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1682       /* We only care about MOV_INDIRECT of a uniform */
1683       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1684           inst->src[0].file != UNIFORM)
1685          continue;
1686 
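      /* UNIFORM registers are vec4-sized, so the byte offset converts to a
       * uniform index in 16-byte units.
       */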
1687       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1688 
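      /* src[2] holds the number of bytes the indirect access may read; mark
       * every vec4 slot in that range as needing to be pulled.
       */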
1689       for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1690          pull_constant_loc[uniform_nr + j] = 0;
1691    }
1692 
1693    /* Next, we walk the list of uniforms and assign real pull constant
1694     * locations and set their corresponding entries in pull_param.
1695     */
1696    for (int j = 0; j < this->uniforms; j++) {
1697       if (pull_constant_loc[j] < 0)
1698          continue;
1699 
1700       pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1701 
1702       for (int i = 0; i < 4; i++) {
1703          stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1704             = stage_prog_data->param[j * 4 + i];
1705       }
1706    }
1707 
1708    /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1709     * instructions to actual uniform pulls.
1710     */
1711    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1712       /* We only care about MOV_INDIRECT of a uniform */
1713       if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1714           inst->src[0].file != UNIFORM)
1715          continue;
1716 
1717       int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1718 
1719       assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1720 
1721       emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1722                               pull_constant_loc[uniform_nr], inst->src[1]);
1723       inst->remove(block);
1724    }
1725 
1726    /* Now there are no accesses of the UNIFORM file with a reladdr, so
1727     * no need to track them as larger-than-vec4 objects.  This will be
1728     * relied on in cutting out unused uniform vectors from push
1729     * constants.
1730     */
1731    split_uniform_registers();
1732 }
1733 
1734 void
1735 vec4_visitor::resolve_ud_negate(src_reg *reg)
1736 {
1737    if (reg->type != BRW_REGISTER_TYPE_UD ||
1738        !reg->negate)
1739       return;
1740 
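   /* Apply the negation with an explicit MOV into a temporary so that the
    * caller ends up with a plain UD source carrying no negate modifier.
    */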
1741    src_reg temp = src_reg(this, glsl_type::uvec4_type);
1742    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1743    *reg = temp;
1744 }
1745 
1746 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1747                            void *log_data,
1748                            const struct brw_sampler_prog_key_data *key_tex,
1749                            struct brw_vue_prog_data *prog_data,
1750                            const nir_shader *shader,
1751 			   void *mem_ctx,
1752                            bool no_spills,
1753                            int shader_time_index,
1754                            bool debug_enabled)
1755    : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base,
1756                     debug_enabled),
1757      key_tex(key_tex),
1758      prog_data(prog_data),
1759      fail_msg(NULL),
1760      first_non_payload_grf(0),
1761      ubo_push_start(),
1762      push_length(0),
1763      live_analysis(this), performance_analysis(this),
1764      need_all_constants_in_pull_buffer(false),
1765      no_spills(no_spills),
1766      shader_time_index(shader_time_index),
1767      last_scratch(0)
1768 {
1769    this->failed = false;
1770 
1771    this->base_ir = NULL;
1772    this->current_annotation = NULL;
1773    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1774 
1775    memset(this->output_num_components, 0, sizeof(this->output_num_components));
1776 
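   /* Gfx7+ has no MRF file; MRF writes are emulated using the top of the GRF
    * file, so cap the number of GRFs available to the allocator accordingly.
    */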
1777    this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF;
1778 
1779    this->uniforms = 0;
1780 
1781    this->nir_locals = NULL;
1782    this->nir_ssa_values = NULL;
1783 }
1784 
1785 
1786 void
1787 vec4_visitor::fail(const char *format, ...)
1788 {
1789    va_list va;
1790    char *msg;
1791 
1792    if (failed)
1793       return;
1794 
1795    failed = true;
1796 
1797    va_start(va, format);
1798    msg = ralloc_vasprintf(mem_ctx, format, va);
1799    va_end(va);
1800    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1801 
1802    this->fail_msg = msg;
1803 
1804    if (unlikely(debug_enabled)) {
1805       fprintf(stderr, "%s",  msg);
1806    }
1807 }
1808 
1809 } /* namespace brw */
1810