/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_VEC4_BUILDER_H
#define BRW_VEC4_BUILDER_H

#include "brw_ir_vec4.h"
#include "brw_ir_allocator.h"

namespace brw {
   /**
    * Toolbox to assemble a VEC4 IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::fs_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class vec4_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef brw::src_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef brw::dst_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef vec4_instruction instruction;

      /**
       * Construct a vec4_builder that inserts instructions into \p shader.
       */
      vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width), _group(0),
         force_writemask_all(false),
         annotation()
      {
      }
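
      /* A minimal usage sketch (assuming a valid backend_shader "s" and
       * previously initialized sources "a" and "b"; names hypothetical).
       * Note the builder must be positioned, e.g. with at_end(), before
       * emitting anything:
       *
       *    const vec4_builder bld = vec4_builder(s).at_end();
       *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.ADD(tmp, a, b);
       */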

      /**
       * Construct a vec4_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size), _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct a vec4_builder that inserts instructions before \p cursor
       * in basic block \p block, inheriting other code generation parameters
       * from this.
       */
      vec4_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         vec4_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct a vec4_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      vec4_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
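
      /* Sketch: position a builder right before an existing instruction in
       * order to emit replacement code for it (assuming "inst" and "block"
       * refer to an existing instruction and its basic block, and "some_src"
       * is a hypothetical source):
       *
       *    const vec4_builder ibld = bld.at(block, inst);
       *    ibld.MOV(inst->dst, some_src);
       *    inst->remove(block);
       */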

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      vec4_builder
      group(unsigned n, unsigned i) const
      {
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         vec4_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }
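
      /* For example, on an 8-wide builder bld.group(4, 1) yields a 4-wide
       * builder whose instructions execute on channels 4..7 only (sketch;
       * "dst" and "src" hypothetical):
       *
       *    const vec4_builder qbld = bld.group(4, 1);
       *    qbld.MOV(dst, src);
       */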

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      vec4_builder
      exec_all(bool b = true) const
      {
         vec4_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      vec4_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         vec4_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (four for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for four logical
       * components in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return retype(dst_reg(VGRF, shader->alloc.allocate(
                                     n * DIV_ROUND_UP(type_sz(type), 4))),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
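
      /* E.g. a vec4 of 32-bit floats fits in a single allocation unit, while
       * a 64-bit type doubles the space per unit (sketch):
       *
       *    const dst_reg v = bld.vgrf(BRW_REGISTER_TYPE_F);    // one unit
       *    const dst_reg d = bld.vgrf(BRW_REGISTER_TYPE_DF);   // two units
       */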

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return fix_math_instruction(
               emit(instruction(opcode, dst,
                                fix_math_operand(src0))));

         default:
            return emit(instruction(opcode, dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return fix_math_instruction(
               emit(instruction(opcode, dst,
                                fix_math_operand(src0),
                                fix_math_operand(src1))));

         default:
            return emit(instruction(opcode, dst, src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dst, src0, src1, src2));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         inst->exec_size = dispatch_width();
         inst->group = group();
         inst->force_writemask_all = force_writemask_all;
         inst->size_written = inst->exec_size * type_sz(inst->dst.type);
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
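
      /* Sketch: component-wise maximum and minimum of two values:
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);   // max(a, b)
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_L);    // min(a, b)
       */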

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         const vec4_builder ubld = exec_all();
         const dst_reg chan_index =
            writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index));

         return src_reg(dst);
      }
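
      /* Typical use is making a potentially divergent value (e.g. an
       * indirectly addressed surface or sampler index) uniform before
       * feeding it to a message that requires a scalar (sketch; "index"
       * is hypothetical):
       *
       *    const src_reg uniform_index = bld.emit_uniformize(index);
       */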

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU3(CSEL)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */
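
      /* Each macro above expands to a thin wrapper around emit(), so the
       * generated methods read like assembly mnemonics (sketch; per the PRM,
       * MAD computes src1 * src2 + src0):
       *
       *    bld.ADD(dst, a, b);       // dst = a + b
       *    bld.MAD(dst, c, a, b);    // dst = a * b + c
       */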

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
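
      /* Sketch: set the flag register with a comparison, then use it to
       * predicate a select:
       *
       *    bld.CMP(bld.null_reg_d(), a, b, BRW_CONDITIONAL_L);
       *    set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst, a, b));
       */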

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      instruction *
      CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMPN null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Gfx6 IF with embedded comparison.
       */
      instruction *
      IF(const src_reg &src0, const src_reg &src1,
         brw_conditional_mod condition) const
      {
         assert(shader->devinfo->ver == 6);
         return set_condmod(condition,
                            emit(BRW_OPCODE_IF,
                                 null_reg_d(),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
          * we need to reorder the operands.
          */
         assert(shader->devinfo->ver >= 6 && shader->devinfo->ver <= 9);
         return emit(BRW_OPCODE_LRP, dst, a, y, x);
      }
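
      /* With the reordering above, bld.LRP(dst, x, y, a) computes
       * dst = x * (1 - a) + y * a, matching GLSL mix(x, y, a) (sketch;
       * names hypothetical):
       *
       *    bld.LRP(color, color0, color1, blend_factor);
       */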

      backend_shader *shader;

   protected:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for the details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD && src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for register access modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
          * able to use vertical stride of zero to replicate the vec4 uniform, like
          *
          *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
          *
          * But you can't, since vertical stride is always four in three-source
          * instructions. Instead, insert a MOV instruction to do the replication so
          * that the three-source instruction can consume it.
          */

         /* The MOV is only needed if the source is a uniform or immediate. */
         if (src.file != UNIFORM && src.file != IMM)
            return src;

         if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
            return src;

         const dst_reg expanded = vgrf(src.type);
         emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
         return src_reg(expanded);
      }

      /**
       * Workaround for register access modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* The gfx6 math instruction ignores the source modifiers --
          * swizzle, abs, negate, and at least some parts of the register
          * region description.
          *
          * Rather than trying to enumerate all these cases, *always* expand the
          * operand to a temp GRF for gfx6.
          *
          * For gfx7, keep the operand as-is, except if immediate, which gfx7 still
          * can't use.
          */
         if (shader->devinfo->ver == 6 ||
             (shader->devinfo->ver == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return src_reg(tmp);
         } else {
            return src;
         }
      }

      /**
       * Workaround other weirdness of the math instruction.
       */
      instruction *
      fix_math_instruction(instruction *inst) const
      {
         if (shader->devinfo->ver == 6 &&
             inst->dst.writemask != WRITEMASK_XYZW) {
            const dst_reg tmp = vgrf(inst->dst.type);
            MOV(inst->dst, src_reg(tmp));
            inst->dst = tmp;

         } else if (shader->devinfo->ver < 6) {
            const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2);
            inst->base_mrf = 1;
            inst->mlen = sources;
         }

         return inst;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif