1 /*
2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3  Intel funded Tungsten Graphics to
4  develop this 3D driver.
5 
6  Permission is hereby granted, free of charge, to any person obtaining
7  a copy of this software and associated documentation files (the
8  "Software"), to deal in the Software without restriction, including
9  without limitation the rights to use, copy, modify, merge, publish,
10  distribute, sublicense, and/or sell copies of the Software, and to
11  permit persons to whom the Software is furnished to do so, subject to
12  the following conditions:
13 
14  The above copyright notice and this permission notice (including the
15  next paragraph) shall be included in all copies or substantial
16  portions of the Software.
17 
18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 
26  **********************************************************************/
27  /*
28   * Authors:
29   *   Keith Whitwell <keithw@vmware.com>
30   */
31 
32 
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35 
36 #include "util/ralloc.h"
37 
38 /**
39  * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40  * registers, implicitly moving the operand to a message register.
41  *
42  * On Sandybridge, this is no longer the case.  This function performs the
43  * explicit move; it should be called before emitting a SEND instruction.
44  */
45 void
46 gen6_resolve_implied_move(struct brw_codegen *p,
47 			  struct brw_reg *src,
48 			  unsigned msg_reg_nr)
49 {
50    const struct gen_device_info *devinfo = p->devinfo;
51    if (devinfo->gen < 6)
52       return;
53 
54    if (src->file == BRW_MESSAGE_REGISTER_FILE)
55       return;
56 
57    if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58       assert(devinfo->gen < 12);
59       brw_push_insn_state(p);
60       brw_set_default_exec_size(p, BRW_EXECUTE_8);
61       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
62       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
63       brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
64 	      retype(*src, BRW_REGISTER_TYPE_UD));
65       brw_pop_insn_state(p);
66    }
67    *src = brw_message_reg(msg_reg_nr);
68 }
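
/* A rough usage sketch of the helper above (illustrative only: the payload
 * register, the MRF number and the message lengths are assumptions, not taken
 * from a real caller):
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);
 *    gen6_resolve_implied_move(p, &payload, 1);   (payload now refers to m1)
 *    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
 *    brw_set_dest(p, send, dst);
 *    brw_set_src0(p, send, payload);
 *    brw_set_desc(p, send, brw_message_desc(p->devinfo, 1, 1, true));
 */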
69 
70 static void
71 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
72 {
73    /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74     * "The send with EOT should use register space R112-R127 for <src>. This is
75     *  to enable loading of a new thread into the same slot while the message
76     *  with EOT for current thread is pending dispatch."
77     *
78     * Since we're pretending to have 16 MRFs anyway, we may as well use the
79     * registers required for messages with EOT.
80     */
81    const struct gen_device_info *devinfo = p->devinfo;
82    if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
83       reg->file = BRW_GENERAL_REGISTER_FILE;
84       reg->nr += GEN7_MRF_HACK_START;
85    }
86 }
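
/* Illustrative mapping for the helper above: assuming GEN7_MRF_HACK_START is
 * 112 (consistent with the R112-R127 range quoted from the PRM), an MRF
 * operand such as m4 is rewritten to the GRF g116, i.e. nr = 4 + 112.
 */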
87 
88 void
89 brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
90 {
91    const struct gen_device_info *devinfo = p->devinfo;
92 
93    if (dest.file == BRW_MESSAGE_REGISTER_FILE)
94       assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
95    else if (dest.file == BRW_GENERAL_REGISTER_FILE)
96       assert(dest.nr < 128);
97 
98    /* The hardware has a restriction where a destination of size Byte with
99     * a stride of 1 is only allowed for a packed byte MOV. For any other
100     * instruction, the stride must be at least 2, even when the destination
101     * is the NULL register.
102     */
103    if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
104        dest.nr == BRW_ARF_NULL &&
105        type_sz(dest.type) == 1 &&
106        dest.hstride == BRW_HORIZONTAL_STRIDE_1) {
107       dest.hstride = BRW_HORIZONTAL_STRIDE_2;
108    }
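   /* For example, a byte-typed destination written as null<1>:UB is emitted
    * with its stride bumped to <2> by the fixup above, which satisfies the
    * packed-byte restriction described in the comment.
    */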
109 
110    gen7_convert_mrf_to_grf(p, &dest);
111 
112    if (devinfo->gen >= 12 &&
113        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
114         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
115       assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
116              dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
117       assert(dest.address_mode == BRW_ADDRESS_DIRECT);
118       assert(dest.subnr == 0);
119       assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
120              (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
121               dest.vstride == dest.width + 1));
122       assert(!dest.negate && !dest.abs);
123       brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
124       brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
125 
126    } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
127               brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
128       assert(devinfo->gen < 12);
129       assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
130              dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
131       assert(dest.address_mode == BRW_ADDRESS_DIRECT);
132       assert(dest.subnr % 16 == 0);
133       assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
134              dest.vstride == dest.width + 1);
135       assert(!dest.negate && !dest.abs);
136       brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
137       brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
138       brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
139    } else {
140       brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
141       brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);
142 
143       if (dest.address_mode == BRW_ADDRESS_DIRECT) {
144          brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
145 
146          if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
147             brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
148             if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
149                dest.hstride = BRW_HORIZONTAL_STRIDE_1;
150             brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
151          } else {
152             brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
153             brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
154             if (dest.file == BRW_GENERAL_REGISTER_FILE ||
155                 dest.file == BRW_MESSAGE_REGISTER_FILE) {
156                assert(dest.writemask != 0);
157             }
158             /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
159              *    Although Dst.HorzStride is a don't care for Align16, HW needs
160              *    this to be programmed as "01".
161              */
162             brw_inst_set_dst_hstride(devinfo, inst, 1);
163          }
164       } else {
165          brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);
166 
167          /* These are different sizes in align1 vs align16:
168           */
169          if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
170             brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
171                                           dest.indirect_offset);
172             if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
173                dest.hstride = BRW_HORIZONTAL_STRIDE_1;
174             brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
175          } else {
176             brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
177                                            dest.indirect_offset);
178             /* Even though this field is ignored in da16 mode, hardware still needs it programmed as '01'. */
179             brw_inst_set_dst_hstride(devinfo, inst, 1);
180          }
181       }
182    }
183 
184    /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
185     * or 16 (SIMD16), as that's normally correct.  However, when dealing with
186     * small registers, it can be useful for us to automatically reduce it to
187     * match the register size.
188     */
189    if (p->automatic_exec_sizes) {
190       /*
191        * In platforms that support fp64 we can emit instructions with a width
192        * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
193        * these cases we need to make sure that these instructions have their
194        * exec sizes set properly when they are emitted and we can't rely on
195        * this code to fix it.
196        */
197       bool fix_exec_size;
198       if (devinfo->gen >= 6)
199          fix_exec_size = dest.width < BRW_EXECUTE_4;
200       else
201          fix_exec_size = dest.width < BRW_EXECUTE_8;
202 
203       if (fix_exec_size)
204          brw_inst_set_exec_size(devinfo, inst, dest.width);
205    }
206 }
207 
208 void
209 brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
210 {
211    const struct gen_device_info *devinfo = p->devinfo;
212 
213    if (reg.file == BRW_MESSAGE_REGISTER_FILE)
214       assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
215    else if (reg.file == BRW_GENERAL_REGISTER_FILE)
216       assert(reg.nr < 128);
217 
218    gen7_convert_mrf_to_grf(p, &reg);
219 
220    if (devinfo->gen >= 6 &&
221        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
222         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
223         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
224         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
225       /* Any source modifiers or regions will be ignored, since this just
226        * identifies the MRF/GRF to start reading the message contents from.
227        * Check for some likely failures.
228        */
229       assert(!reg.negate);
230       assert(!reg.abs);
231       assert(reg.address_mode == BRW_ADDRESS_DIRECT);
232    }
233 
234    if (devinfo->gen >= 12 &&
235        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
236         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
237       assert(reg.file != BRW_IMMEDIATE_VALUE);
238       assert(reg.address_mode == BRW_ADDRESS_DIRECT);
239       assert(reg.subnr == 0);
240       assert(has_scalar_region(reg) ||
241              (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
242               reg.vstride == reg.width + 1));
243       assert(!reg.negate && !reg.abs);
244       brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
245       brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
246 
247    } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
248               brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
249       assert(reg.file == BRW_GENERAL_REGISTER_FILE);
250       assert(reg.address_mode == BRW_ADDRESS_DIRECT);
251       assert(reg.subnr % 16 == 0);
252       assert(has_scalar_region(reg) ||
253              (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
254               reg.vstride == reg.width + 1));
255       assert(!reg.negate && !reg.abs);
256       brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
257       brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
258    } else {
259       brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
260       brw_inst_set_src0_abs(devinfo, inst, reg.abs);
261       brw_inst_set_src0_negate(devinfo, inst, reg.negate);
262       brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
263 
264       if (reg.file == BRW_IMMEDIATE_VALUE) {
265          if (reg.type == BRW_REGISTER_TYPE_DF ||
266              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
267             brw_inst_set_imm_df(devinfo, inst, reg.df);
268          else if (reg.type == BRW_REGISTER_TYPE_UQ ||
269                   reg.type == BRW_REGISTER_TYPE_Q)
270             brw_inst_set_imm_uq(devinfo, inst, reg.u64);
271          else
272             brw_inst_set_imm_ud(devinfo, inst, reg.ud);
273 
274          if (devinfo->gen < 12 && type_sz(reg.type) < 8) {
275             brw_inst_set_src1_reg_file(devinfo, inst,
276                                        BRW_ARCHITECTURE_REGISTER_FILE);
277             brw_inst_set_src1_reg_hw_type(devinfo, inst,
278                                           brw_inst_src0_reg_hw_type(devinfo, inst));
279          }
280       } else {
281          if (reg.address_mode == BRW_ADDRESS_DIRECT) {
282             brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
283             if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
284                 brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
285             } else {
286                brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
287             }
288          } else {
289             brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);
290 
291             if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
292                brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
293             } else {
294                brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
295             }
296          }
297 
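         /* In Align1 mode a width-1 source of a SIMD1 instruction is forced
          * to the scalar region <0;1,0> below; any other region is passed
          * through unchanged.  Align16 sources instead take a swizzle and a
          * restricted vertical stride.
          */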
298          if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
299             if (reg.width == BRW_WIDTH_1 &&
300                 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
301                brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
302                brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
303                brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
304             } else {
305                brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
306                brw_inst_set_src0_width(devinfo, inst, reg.width);
307                brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
308             }
309          } else {
310             brw_inst_set_src0_da16_swiz_x(devinfo, inst,
311                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
312             brw_inst_set_src0_da16_swiz_y(devinfo, inst,
313                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
314             brw_inst_set_src0_da16_swiz_z(devinfo, inst,
315                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
316             brw_inst_set_src0_da16_swiz_w(devinfo, inst,
317                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
318 
319             if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
320                /* This is an oddity arising from using the same region
321                 * descriptions for registers in align_16 as in align_1:
322                 */
323                brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
324             } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
325                        reg.type == BRW_REGISTER_TYPE_DF &&
326                        reg.vstride == BRW_VERTICAL_STRIDE_2) {
327                /* From SNB PRM:
328                 *
329                 * "For Align16 access mode, only encodings of 0000 and 0011
330                 *  are allowed. Other codes are reserved."
331                 *
332                 * Presumably the DevSNB behavior applies to IVB as well.
333                 */
334                brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
335             } else {
336                brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
337             }
338          }
339       }
340    }
341 }
342 
343 
344 void
345 brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
346 {
347    const struct gen_device_info *devinfo = p->devinfo;
348 
349    if (reg.file == BRW_GENERAL_REGISTER_FILE)
350       assert(reg.nr < 128);
351 
352    if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
353        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
354        (devinfo->gen >= 12 &&
355         (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
356          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
357       assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
358              reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
359       assert(reg.address_mode == BRW_ADDRESS_DIRECT);
360       assert(reg.subnr == 0);
361       assert(has_scalar_region(reg) ||
362              (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
363               reg.vstride == reg.width + 1));
364       assert(!reg.negate && !reg.abs);
365       brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
366       brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
367    } else {
368       /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
369        *
370        *    "Accumulator registers may be accessed explicitly as src0
371        *    operands only."
372        */
373       assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
374              reg.nr != BRW_ARF_ACCUMULATOR);
375 
376       gen7_convert_mrf_to_grf(p, &reg);
377       assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
378 
379       brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
380       brw_inst_set_src1_abs(devinfo, inst, reg.abs);
381       brw_inst_set_src1_negate(devinfo, inst, reg.negate);
382 
383       /* Only src1 can be immediate in two-argument instructions.
384        */
385       assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);
386 
387       if (reg.file == BRW_IMMEDIATE_VALUE) {
388          /* two-argument instructions can only use 32-bit immediates */
389          assert(type_sz(reg.type) < 8);
390          brw_inst_set_imm_ud(devinfo, inst, reg.ud);
391       } else {
392          /* This is a hardware restriction, which may or may not be lifted
393           * in the future:
394           */
395          assert (reg.address_mode == BRW_ADDRESS_DIRECT);
396          /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
397 
398          brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
399          if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
400             brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
401          } else {
402             brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
403          }
404 
405          if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
406             if (reg.width == BRW_WIDTH_1 &&
407                 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
408                brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
409                brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
410                brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
411             } else {
412                brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
413                brw_inst_set_src1_width(devinfo, inst, reg.width);
414                brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
415             }
416          } else {
417             brw_inst_set_src1_da16_swiz_x(devinfo, inst,
418                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
419             brw_inst_set_src1_da16_swiz_y(devinfo, inst,
420                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
421             brw_inst_set_src1_da16_swiz_z(devinfo, inst,
422                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
423             brw_inst_set_src1_da16_swiz_w(devinfo, inst,
424                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
425 
426             if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
427                /* This is an oddity arising from using the same region
428                 * descriptions for registers in align_16 as in align_1:
429                 */
430                brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
431             } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
432                        reg.type == BRW_REGISTER_TYPE_DF &&
433                        reg.vstride == BRW_VERTICAL_STRIDE_2) {
434                /* From SNB PRM:
435                 *
436                 * "For Align16 access mode, only encodings of 0000 and 0011
437                 *  are allowed. Other codes are reserved."
438                 *
439                 * Presumably the DevSNB behavior applies to IVB as well.
440                 */
441                brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
442             } else {
443                brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
444             }
445          }
446       }
447    }
448 }
449 
450 /**
451  * Specify the descriptor and extended descriptor immediate for a SEND(C)
452  * message instruction.
453  */
454 void
455 brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
456                 unsigned desc, unsigned ex_desc)
457 {
458    const struct gen_device_info *devinfo = p->devinfo;
459    assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
460           brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
461    if (devinfo->gen < 12)
462       brw_inst_set_src1_file_type(devinfo, inst,
463                                   BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
464    brw_inst_set_send_desc(devinfo, inst, desc);
465    if (devinfo->gen >= 9)
466       brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
467 }
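
/* brw_set_desc(), used throughout this file, is expected to be a thin wrapper
 * over this function that passes ex_desc == 0 (an assumption based on its use
 * here and its definition in brw_eu.h); callers that need extended-descriptor
 * bits on Gen9+ call brw_set_desc_ex() directly.
 */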
468 
469 static void brw_set_math_message( struct brw_codegen *p,
470 				  brw_inst *inst,
471 				  unsigned function,
472 				  unsigned integer_type,
473 				  bool low_precision,
474 				  unsigned dataType )
475 {
476    const struct gen_device_info *devinfo = p->devinfo;
477    unsigned msg_length;
478    unsigned response_length;
479 
480    /* Infer message length from the function */
481    switch (function) {
482    case BRW_MATH_FUNCTION_POW:
483    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
484    case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
485    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
486       msg_length = 2;
487       break;
488    default:
489       msg_length = 1;
490       break;
491    }
492 
493    /* Infer response length from the function */
494    switch (function) {
495    case BRW_MATH_FUNCTION_SINCOS:
496    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
497       response_length = 2;
498       break;
499    default:
500       response_length = 1;
501       break;
502    }
503 
504    brw_set_desc(p, inst, brw_message_desc(
505                    devinfo, msg_length, response_length, false));
506 
507    brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
508    brw_inst_set_math_msg_function(devinfo, inst, function);
509    brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
510    brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
511    brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
512    brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
513    brw_inst_set_saturate(devinfo, inst, 0);
514 }
515 
516 
517 static void brw_set_ff_sync_message(struct brw_codegen *p,
518 				    brw_inst *insn,
519 				    bool allocate,
520 				    unsigned response_length,
521 				    bool end_of_thread)
522 {
523    const struct gen_device_info *devinfo = p->devinfo;
524 
525    brw_set_desc(p, insn, brw_message_desc(
526                    devinfo, 1, response_length, true));
527 
528    brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
529    brw_inst_set_eot(devinfo, insn, end_of_thread);
530    brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
531    brw_inst_set_urb_allocate(devinfo, insn, allocate);
532    /* The following fields are not used by FF_SYNC: */
533    brw_inst_set_urb_global_offset(devinfo, insn, 0);
534    brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
535    brw_inst_set_urb_used(devinfo, insn, 0);
536    brw_inst_set_urb_complete(devinfo, insn, 0);
537 }
538 
539 static void brw_set_urb_message( struct brw_codegen *p,
540 				 brw_inst *insn,
541                                  enum brw_urb_write_flags flags,
542 				 unsigned msg_length,
543 				 unsigned response_length,
544 				 unsigned offset,
545 				 unsigned swizzle_control )
546 {
547    const struct gen_device_info *devinfo = p->devinfo;
548 
549    assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
550    assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
551    assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
552 
553    brw_set_desc(p, insn, brw_message_desc(
554                    devinfo, msg_length, response_length, true));
555 
556    brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
557    brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));
558 
559    if (flags & BRW_URB_WRITE_OWORD) {
560       assert(msg_length == 2); /* header + one OWORD of data */
561       brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
562    } else {
563       brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
564    }
565 
566    brw_inst_set_urb_global_offset(devinfo, insn, offset);
567    brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
568 
569    if (devinfo->gen < 8) {
570       brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
571    }
572 
573    if (devinfo->gen < 7) {
574       brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
575       brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
576    } else {
577       brw_inst_set_urb_per_slot_offset(devinfo, insn,
578          !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
579    }
580 }
581 
582 static void
583 gen7_set_dp_scratch_message(struct brw_codegen *p,
584                             brw_inst *inst,
585                             bool write,
586                             bool dword,
587                             bool invalidate_after_read,
588                             unsigned num_regs,
589                             unsigned addr_offset,
590                             unsigned mlen,
591                             unsigned rlen,
592                             bool header_present)
593 {
594    const struct gen_device_info *devinfo = p->devinfo;
595    assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
596           (devinfo->gen >= 8 && num_regs == 8));
597    const unsigned block_size = (devinfo->gen >= 8 ? util_logbase2(num_regs) :
598                                 num_regs - 1);
599 
600    brw_set_desc(p, inst, brw_message_desc(
601                    devinfo, mlen, rlen, header_present));
602 
603    brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
604    brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
605    brw_inst_set_scratch_read_write(devinfo, inst, write);
606    brw_inst_set_scratch_type(devinfo, inst, dword);
607    brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
608    brw_inst_set_scratch_block_size(devinfo, inst, block_size);
609    brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
610 }
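
/* Worked example of the block_size encoding above: for num_regs == 4 the
 * field is util_logbase2(4) == 2 on Gen8+, but 4 - 1 == 3 on Gen7/7.5,
 * matching the two different scratch-message encodings.
 */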
611 
612 static void
613 brw_inst_set_state(const struct gen_device_info *devinfo,
614                    brw_inst *insn,
615                    const struct brw_insn_state *state)
616 {
617    brw_inst_set_exec_size(devinfo, insn, state->exec_size);
618    brw_inst_set_group(devinfo, insn, state->group);
619    brw_inst_set_compression(devinfo, insn, state->compressed);
620    brw_inst_set_access_mode(devinfo, insn, state->access_mode);
621    brw_inst_set_mask_control(devinfo, insn, state->mask_control);
622    if (devinfo->gen >= 12)
623       brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(state->swsb));
624    brw_inst_set_saturate(devinfo, insn, state->saturate);
625    brw_inst_set_pred_control(devinfo, insn, state->predicate);
626    brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);
627 
628    if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
629        state->access_mode == BRW_ALIGN_16) {
630       brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
631       if (devinfo->gen >= 7)
632          brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
633    } else {
634       brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
635       if (devinfo->gen >= 7)
636          brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
637    }
638 
639    if (devinfo->gen >= 6)
640       brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
641 }
642 
643 #define next_insn brw_next_insn
644 brw_inst *
645 brw_next_insn(struct brw_codegen *p, unsigned opcode)
646 {
647    const struct gen_device_info *devinfo = p->devinfo;
648    brw_inst *insn;
649 
650    if (p->nr_insn + 1 > p->store_size) {
651       p->store_size <<= 1;
652       p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
653    }
654 
655    p->next_insn_offset += 16;
656    insn = &p->store[p->nr_insn++];
657 
658    memset(insn, 0, sizeof(*insn));
659    brw_inst_set_opcode(devinfo, insn, opcode);
660 
661    /* Apply the default instruction state */
662    brw_inst_set_state(devinfo, insn, p->current);
663 
664    return insn;
665 }
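
/* Each element of p->store is a full-size 128-bit (16-byte) native
 * instruction, which is why next_insn_offset advances by 16; any compaction
 * to 64-bit forms happens in a later pass.  The store grows by doubling.
 */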
666 
667 static brw_inst *
668 brw_alu1(struct brw_codegen *p, unsigned opcode,
669          struct brw_reg dest, struct brw_reg src)
670 {
671    brw_inst *insn = next_insn(p, opcode);
672    brw_set_dest(p, insn, dest);
673    brw_set_src0(p, insn, src);
674    return insn;
675 }
676 
677 static brw_inst *
678 brw_alu2(struct brw_codegen *p, unsigned opcode,
679          struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
680 {
681    /* 64-bit immediates are only supported on 1-src instructions */
682    assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
683    assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
684 
685    brw_inst *insn = next_insn(p, opcode);
686    brw_set_dest(p, insn, dest);
687    brw_set_src0(p, insn, src0);
688    brw_set_src1(p, insn, src1);
689    return insn;
690 }
691 
692 static int
693 get_3src_subreg_nr(struct brw_reg reg)
694 {
695    /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
696     * use 32-bit units (components 0..7).  Since they only support F/D/UD
697     * types, this doesn't lose any flexibility, but uses fewer bits.
698     */
699    return reg.subnr / 4;
700 }
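
/* Example: a byte subnr of 12 maps to component 12 / 4 == 3, i.e. the fourth
 * 32-bit channel of the register.
 */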
701 
702 static enum gen10_align1_3src_vertical_stride
703 to_3src_align1_vstride(const struct gen_device_info *devinfo,
704                        enum brw_vertical_stride vstride)
705 {
706    switch (vstride) {
707    case BRW_VERTICAL_STRIDE_0:
708       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
709    case BRW_VERTICAL_STRIDE_1:
710       assert(devinfo->gen >= 12);
711       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
712    case BRW_VERTICAL_STRIDE_2:
713       assert(devinfo->gen < 12);
714       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
715    case BRW_VERTICAL_STRIDE_4:
716       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
717    case BRW_VERTICAL_STRIDE_8:
718    case BRW_VERTICAL_STRIDE_16:
719       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
720    default:
721       unreachable("invalid vstride");
722    }
723 }
724 
725 
726 static enum gen10_align1_3src_src_horizontal_stride
727 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
728 {
729    switch (hstride) {
730    case BRW_HORIZONTAL_STRIDE_0:
731       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
732    case BRW_HORIZONTAL_STRIDE_1:
733       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
734    case BRW_HORIZONTAL_STRIDE_2:
735       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
736    case BRW_HORIZONTAL_STRIDE_4:
737       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
738    default:
739       unreachable("invalid hstride");
740    }
741 }
742 
743 static brw_inst *
744 brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
745          struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
746 {
747    const struct gen_device_info *devinfo = p->devinfo;
748    brw_inst *inst = next_insn(p, opcode);
749 
750    gen7_convert_mrf_to_grf(p, &dest);
751 
752    assert(dest.nr < 128);
753 
754    if (devinfo->gen >= 10)
755       assert(!(src0.file == BRW_IMMEDIATE_VALUE &&
756                src2.file == BRW_IMMEDIATE_VALUE));
757 
758    assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
759    assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
760    assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
761    assert(dest.address_mode == BRW_ADDRESS_DIRECT);
762    assert(src0.address_mode == BRW_ADDRESS_DIRECT);
763    assert(src1.address_mode == BRW_ADDRESS_DIRECT);
764    assert(src2.address_mode == BRW_ADDRESS_DIRECT);
765 
766    if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
767       assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
768              dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
769 
770       if (devinfo->gen >= 12) {
771          brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
772          brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
773       } else {
774          if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
775             brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
776                                               BRW_ALIGN1_3SRC_ACCUMULATOR);
777             brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
778          } else {
779             brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
780                                               BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
781             brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
782          }
783       }
784       brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);
785 
786       brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);
787 
788       if (brw_reg_type_is_floating_point(dest.type)) {
789          brw_inst_set_3src_a1_exec_type(devinfo, inst,
790                                         BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
791       } else {
792          brw_inst_set_3src_a1_exec_type(devinfo, inst,
793                                         BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
794       }
795 
796       brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
797       brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
798       brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
799       brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);
800 
801       if (src0.file == BRW_IMMEDIATE_VALUE) {
802          brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
803       } else {
804          brw_inst_set_3src_a1_src0_vstride(
805             devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
806          brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
807                                            to_3src_align1_hstride(src0.hstride));
808          brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
809          if (src0.type == BRW_REGISTER_TYPE_NF) {
810             brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
811          } else {
812             brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
813          }
814          brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
815          brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
816       }
817       brw_inst_set_3src_a1_src1_vstride(
818          devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
819       brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
820                                         to_3src_align1_hstride(src1.hstride));
821 
822       brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
823       if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
824          brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
825       } else {
826          brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
827       }
828       brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
829       brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
830 
831       if (src2.file == BRW_IMMEDIATE_VALUE) {
832          brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
833       } else {
834          brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
835                                            to_3src_align1_hstride(src2.hstride));
836          /* no vstride on src2 */
837          brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
838          brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
839          brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
840          brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
841       }
842 
843       assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
844              src0.file == BRW_IMMEDIATE_VALUE ||
845              (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
846               src0.type == BRW_REGISTER_TYPE_NF));
847       assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
848              src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
849       assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
850              src2.file == BRW_IMMEDIATE_VALUE);
851 
852       if (devinfo->gen >= 12) {
853          if (src0.file == BRW_IMMEDIATE_VALUE) {
854             brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
855          } else {
856             brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
857          }
858 
859          brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);
860 
861          if (src2.file == BRW_IMMEDIATE_VALUE) {
862             brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
863          } else {
864             brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
865          }
866       } else {
867          brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
868                                             src0.file == BRW_GENERAL_REGISTER_FILE ?
869                                             BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
870                                             BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
871          brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
872                                             src1.file == BRW_GENERAL_REGISTER_FILE ?
873                                             BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
874                                             BRW_ALIGN1_3SRC_ACCUMULATOR);
875          brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
876                                             src2.file == BRW_GENERAL_REGISTER_FILE ?
877                                             BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
878                                             BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
879       }
880 
881    } else {
882       assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
883              dest.file == BRW_MESSAGE_REGISTER_FILE);
884       assert(dest.type == BRW_REGISTER_TYPE_F  ||
885              dest.type == BRW_REGISTER_TYPE_DF ||
886              dest.type == BRW_REGISTER_TYPE_D  ||
887              dest.type == BRW_REGISTER_TYPE_UD ||
888              (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
889       if (devinfo->gen == 6) {
890          brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
891                                             dest.file == BRW_MESSAGE_REGISTER_FILE);
892       }
893       brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
894       brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
895       brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
896 
897       assert(src0.file == BRW_GENERAL_REGISTER_FILE);
898       brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
899       brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
900       brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
901       brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
902       brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
903       brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
904                                           src0.vstride == BRW_VERTICAL_STRIDE_0);
905 
906       assert(src1.file == BRW_GENERAL_REGISTER_FILE);
907       brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
908       brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
909       brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
910       brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
911       brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
912       brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
913                                           src1.vstride == BRW_VERTICAL_STRIDE_0);
914 
915       assert(src2.file == BRW_GENERAL_REGISTER_FILE);
916       brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
917       brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
918       brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
919       brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
920       brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
921       brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
922                                           src2.vstride == BRW_VERTICAL_STRIDE_0);
923 
924       if (devinfo->gen >= 7) {
925          /* Set both the source and destination types based on dest.type,
926           * ignoring the source register types.  The MAD and LRP emitters ensure
927           * that all four types are float.  The BFE and BFI2 emitters, however,
928           * may send us mixed D and UD types and want us to ignore that and use
929           * the destination type.
930           */
931          brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
932          brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
933 
934          /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
935           *
936           *    "Three source instructions can use operands with mixed-mode
937           *     precision. When SrcType field is set to :f or :hf it defines
938           *     precision for source 0 only, and fields Src1Type and Src2Type
939           *     define precision for other source operands:
940           *
941           *     0b = :f. Single precision Float (32-bit).
942           *     1b = :hf. Half precision Float (16-bit)."
943           */
944          if (src1.type == BRW_REGISTER_TYPE_HF)
945             brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
946 
947          if (src2.type == BRW_REGISTER_TYPE_HF)
948             brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
949       }
950    }
951 
952    return inst;
953 }
954 
955 
956 /***********************************************************************
957  * Convenience routines.
958  */
959 #define ALU1(OP)					\
960 brw_inst *brw_##OP(struct brw_codegen *p,		\
961 	      struct brw_reg dest,			\
962 	      struct brw_reg src0)   			\
963 {							\
964    return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
965 }
966 
967 #define ALU2(OP)					\
968 brw_inst *brw_##OP(struct brw_codegen *p,		\
969 	      struct brw_reg dest,			\
970 	      struct brw_reg src0,			\
971 	      struct brw_reg src1)   			\
972 {							\
973    return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
974 }
975 
976 #define ALU3(OP)					\
977 brw_inst *brw_##OP(struct brw_codegen *p,		\
978 	      struct brw_reg dest,			\
979 	      struct brw_reg src0,			\
980 	      struct brw_reg src1,			\
981 	      struct brw_reg src2)   			\
982 {                                                       \
983    if (p->current->access_mode == BRW_ALIGN_16) {       \
984       if (src0.vstride == BRW_VERTICAL_STRIDE_0)        \
985          src0.swizzle = BRW_SWIZZLE_XXXX;               \
986       if (src1.vstride == BRW_VERTICAL_STRIDE_0)        \
987          src1.swizzle = BRW_SWIZZLE_XXXX;               \
988       if (src2.vstride == BRW_VERTICAL_STRIDE_0)        \
989          src2.swizzle = BRW_SWIZZLE_XXXX;               \
990    }                                                    \
991    return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
992 }
993 
994 #define ALU3F(OP)                                               \
995 brw_inst *brw_##OP(struct brw_codegen *p,         \
996                                  struct brw_reg dest,           \
997                                  struct brw_reg src0,           \
998                                  struct brw_reg src1,           \
999                                  struct brw_reg src2)           \
1000 {                                                               \
1001    assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
1002           dest.type == BRW_REGISTER_TYPE_DF);                   \
1003    if (dest.type == BRW_REGISTER_TYPE_F) {                      \
1004       assert(src0.type == BRW_REGISTER_TYPE_F);                 \
1005       assert(src1.type == BRW_REGISTER_TYPE_F);                 \
1006       assert(src2.type == BRW_REGISTER_TYPE_F);                 \
1007    } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
1008       assert(src0.type == BRW_REGISTER_TYPE_DF);                \
1009       assert(src1.type == BRW_REGISTER_TYPE_DF);                \
1010       assert(src2.type == BRW_REGISTER_TYPE_DF);                \
1011    }                                                            \
1012                                                                 \
1013    if (p->current->access_mode == BRW_ALIGN_16) {               \
1014       if (src0.vstride == BRW_VERTICAL_STRIDE_0)                \
1015          src0.swizzle = BRW_SWIZZLE_XXXX;                       \
1016       if (src1.vstride == BRW_VERTICAL_STRIDE_0)                \
1017          src1.swizzle = BRW_SWIZZLE_XXXX;                       \
1018       if (src2.vstride == BRW_VERTICAL_STRIDE_0)                \
1019          src2.swizzle = BRW_SWIZZLE_XXXX;                       \
1020    }                                                            \
1021    return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
1022 }
1023 
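/* Each invocation below expands to a small emitter; for example ALU2(AND)
 * defines
 *
 *    brw_inst *brw_AND(struct brw_codegen *p, struct brw_reg dest,
 *                      struct brw_reg src0, struct brw_reg src1)
 *    { return brw_alu2(p, BRW_OPCODE_AND, dest, src0, src1); }
 *
 * The ALU3/ALU3F variants additionally force an XXXX swizzle onto scalar
 * (vstride 0) sources in Align16 mode, as the macro bodies above show.
 */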
1024 ALU2(SEL)
1025 ALU1(NOT)
1026 ALU2(AND)
1027 ALU2(OR)
1028 ALU2(XOR)
1029 ALU2(SHR)
1030 ALU2(SHL)
1031 ALU1(DIM)
1032 ALU2(ASR)
1033 ALU2(ROL)
1034 ALU2(ROR)
1035 ALU3(CSEL)
1036 ALU1(FRC)
1037 ALU1(RNDD)
1038 ALU1(RNDE)
1039 ALU1(RNDU)
1040 ALU1(RNDZ)
1041 ALU2(MAC)
1042 ALU2(MACH)
1043 ALU1(LZD)
1044 ALU2(DP4)
1045 ALU2(DPH)
1046 ALU2(DP3)
1047 ALU2(DP2)
1048 ALU3(MAD)
1049 ALU3F(LRP)
1050 ALU1(BFREV)
1051 ALU3(BFE)
1052 ALU2(BFI1)
1053 ALU3(BFI2)
1054 ALU1(FBH)
1055 ALU1(FBL)
1056 ALU1(CBIT)
1057 ALU2(ADDC)
1058 ALU2(SUBB)
1059 
1060 brw_inst *
1061 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
1062 {
1063    const struct gen_device_info *devinfo = p->devinfo;
1064 
1065    /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1066     * To avoid the problems that causes, we use an <X,2,0> source region to
1067     * read each element twice.
1068     */
1069    if (devinfo->gen == 7 && !devinfo->is_haswell &&
1070        brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
1071        dest.type == BRW_REGISTER_TYPE_DF &&
1072        (src0.type == BRW_REGISTER_TYPE_F ||
1073         src0.type == BRW_REGISTER_TYPE_D ||
1074         src0.type == BRW_REGISTER_TYPE_UD) &&
1075        !has_scalar_region(src0)) {
1076       assert(src0.vstride == src0.width + src0.hstride);
1077       src0.vstride = src0.hstride;
1078       src0.width = BRW_WIDTH_2;
1079       src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1080    }
1081 
1082    return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1083 }
1084 
1085 brw_inst *
1086 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1087         struct brw_reg src0, struct brw_reg src1)
1088 {
1089    /* 6.2.2: add */
1090    if (src0.type == BRW_REGISTER_TYPE_F ||
1091        (src0.file == BRW_IMMEDIATE_VALUE &&
1092 	src0.type == BRW_REGISTER_TYPE_VF)) {
1093       assert(src1.type != BRW_REGISTER_TYPE_UD);
1094       assert(src1.type != BRW_REGISTER_TYPE_D);
1095    }
1096 
1097    if (src1.type == BRW_REGISTER_TYPE_F ||
1098        (src1.file == BRW_IMMEDIATE_VALUE &&
1099 	src1.type == BRW_REGISTER_TYPE_VF)) {
1100       assert(src0.type != BRW_REGISTER_TYPE_UD);
1101       assert(src0.type != BRW_REGISTER_TYPE_D);
1102    }
1103 
1104    return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1105 }
1106 
1107 brw_inst *
1108 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1109         struct brw_reg src0, struct brw_reg src1)
1110 {
1111    assert(dest.type == src0.type);
1112    assert(src0.type == src1.type);
1113    switch (src0.type) {
1114    case BRW_REGISTER_TYPE_B:
1115    case BRW_REGISTER_TYPE_UB:
1116    case BRW_REGISTER_TYPE_W:
1117    case BRW_REGISTER_TYPE_UW:
1118    case BRW_REGISTER_TYPE_D:
1119    case BRW_REGISTER_TYPE_UD:
1120       break;
1121    default:
1122       unreachable("Bad type for brw_AVG");
1123    }
1124 
1125    return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1126 }
1127 
1128 brw_inst *
1129 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1130         struct brw_reg src0, struct brw_reg src1)
1131 {
1132    /* 6.32.38: mul */
1133    if (src0.type == BRW_REGISTER_TYPE_D ||
1134        src0.type == BRW_REGISTER_TYPE_UD ||
1135        src1.type == BRW_REGISTER_TYPE_D ||
1136        src1.type == BRW_REGISTER_TYPE_UD) {
1137       assert(dest.type != BRW_REGISTER_TYPE_F);
1138    }
1139 
1140    if (src0.type == BRW_REGISTER_TYPE_F ||
1141        (src0.file == BRW_IMMEDIATE_VALUE &&
1142 	src0.type == BRW_REGISTER_TYPE_VF)) {
1143       assert(src1.type != BRW_REGISTER_TYPE_UD);
1144       assert(src1.type != BRW_REGISTER_TYPE_D);
1145    }
1146 
1147    if (src1.type == BRW_REGISTER_TYPE_F ||
1148        (src1.file == BRW_IMMEDIATE_VALUE &&
1149 	src1.type == BRW_REGISTER_TYPE_VF)) {
1150       assert(src0.type != BRW_REGISTER_TYPE_UD);
1151       assert(src0.type != BRW_REGISTER_TYPE_D);
1152    }
1153 
1154    assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1155 	  src0.nr != BRW_ARF_ACCUMULATOR);
1156    assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1157 	  src1.nr != BRW_ARF_ACCUMULATOR);
1158 
1159    return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1160 }
1161 
1162 brw_inst *
1163 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1164          struct brw_reg src0, struct brw_reg src1)
1165 {
1166    src0.vstride = BRW_VERTICAL_STRIDE_0;
1167    src0.width = BRW_WIDTH_1;
1168    src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1169    return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1170 }
1171 
1172 brw_inst *
1173 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1174         struct brw_reg src0, struct brw_reg src1)
1175 {
1176    src0.vstride = BRW_VERTICAL_STRIDE_0;
1177    src0.width = BRW_WIDTH_1;
1178    src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1179    src1.vstride = BRW_VERTICAL_STRIDE_8;
1180    src1.width = BRW_WIDTH_8;
1181    src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1182    return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1183 }
1184 
1185 brw_inst *
1186 brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1187 {
1188    const struct gen_device_info *devinfo = p->devinfo;
1189    const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1190    /* The F32TO16 instruction doesn't support 32-bit destination types in
1191     * Align1 mode, and neither does the Gen8 implementation in terms of a
1192     * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
1193     * an undocumented feature.
1194     */
1195    const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
1196                                  (!align16 || devinfo->gen >= 8));
1197    brw_inst *inst;
1198 
1199    if (align16) {
1200       assert(dst.type == BRW_REGISTER_TYPE_UD);
1201    } else {
1202       assert(dst.type == BRW_REGISTER_TYPE_UD ||
1203              dst.type == BRW_REGISTER_TYPE_W ||
1204              dst.type == BRW_REGISTER_TYPE_UW ||
1205              dst.type == BRW_REGISTER_TYPE_HF);
1206    }
1207 
1208    brw_push_insn_state(p);
1209 
1210    if (needs_zero_fill) {
1211       brw_set_default_access_mode(p, BRW_ALIGN_1);
1212       dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
1213    }
1214 
1215    if (devinfo->gen >= 8) {
1216       inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
1217    } else {
1218       assert(devinfo->gen == 7);
1219       inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
1220    }
1221 
1222    if (needs_zero_fill) {
1223       if (devinfo->gen < 12)
1224          brw_inst_set_no_dd_clear(devinfo, inst, true);
1225       brw_set_default_swsb(p, tgl_swsb_null());
1226       inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
1227       if (devinfo->gen < 12)
1228          brw_inst_set_no_dd_check(devinfo, inst, true);
1229    }
1230 
1231    brw_pop_insn_state(p);
1232    return inst;
1233 }
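
/* In the zero-fill path above the destination is viewed as a stride-2 W
 * region: the conversion writes the even (low) words and the second MOV
 * writes 0 to the odd (high) words.  Before Gen12 the pair is marked
 * NoDDClr/NoDDChk so the destination dependency between the two partial
 * writes is not enforced.
 */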
1234 
1235 brw_inst *
1236 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1237 {
1238    const struct gen_device_info *devinfo = p->devinfo;
1239    bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1240 
1241    if (align16) {
1242       assert(src.type == BRW_REGISTER_TYPE_UD);
1243    } else {
1244       /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1245        *
1246        *   Because this instruction does not have a 16-bit floating-point
1247        *   type, the source data type must be Word (W). The destination type
1248        *   must be F (Float).
1249        */
1250       if (src.type == BRW_REGISTER_TYPE_UD)
1251          src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1252 
1253       assert(src.type == BRW_REGISTER_TYPE_W ||
1254              src.type == BRW_REGISTER_TYPE_UW ||
1255              src.type == BRW_REGISTER_TYPE_HF);
1256    }
1257 
1258    if (devinfo->gen >= 8) {
1259       return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1260    } else {
1261       assert(devinfo->gen == 7);
1262       return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1263    }
1264 }
1265 
1266 
1267 void brw_NOP(struct brw_codegen *p)
1268 {
1269    brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1270    memset(insn, 0, sizeof(*insn));
1271    brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1272 }
1273 
1274 void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
1275 {
1276    brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
1277    brw_inst_set_cond_modifier(p->devinfo, insn, func);
1278 }
1279 
1280 /***********************************************************************
1281  * Comparisons, if/else/endif
1282  */
1283 
1284 brw_inst *
1285 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1286          unsigned predicate_control)
1287 {
1288    const struct gen_device_info *devinfo = p->devinfo;
1289    struct brw_reg ip = brw_ip_reg();
1290    brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1291 
1292    brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1293    brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1294    brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1295    brw_inst_set_pred_control(devinfo, inst, predicate_control);
1296 
1297    return inst;
1298 }
1299 
1300 static void
1301 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1302 {
1303    p->if_stack[p->if_stack_depth] = inst - p->store;
1304 
1305    p->if_stack_depth++;
1306    if (p->if_stack_array_size <= p->if_stack_depth) {
1307       p->if_stack_array_size *= 2;
1308       p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1309 			     p->if_stack_array_size);
1310    }
1311 }
1312 
1313 static brw_inst *
1314 pop_if_stack(struct brw_codegen *p)
1315 {
1316    p->if_stack_depth--;
1317    return &p->store[p->if_stack[p->if_stack_depth]];
1318 }
1319 
1320 static void
1321 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1322 {
1323    if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1324       p->loop_stack_array_size *= 2;
1325       p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1326 			       p->loop_stack_array_size);
1327       p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1328 				     p->loop_stack_array_size);
1329    }
1330 
1331    p->loop_stack[p->loop_stack_depth] = inst - p->store;
1332    p->loop_stack_depth++;
1333    p->if_depth_in_loop[p->loop_stack_depth] = 0;
1334 }
1335 
1336 static brw_inst *
1337 get_inner_do_insn(struct brw_codegen *p)
1338 {
1339    return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1340 }
1341 
1342 /* EU takes the value from the flag register and pushes it onto some
1343  * sort of a stack (presumably merging with any flag value already on
1344  * the stack).  Within an if block, the flags at the top of the stack
1345  * control execution on each channel of the unit, eg. on each of the
1346  * 16 pixel values in our wm programs.
1347  *
1348  * When the matching 'else' instruction is reached (presumably by
1349  * countdown of the instruction count patched in by our ELSE/ENDIF
1350  * functions), the relevant flags are inverted.
1351  *
1352  * When the matching 'endif' instruction is reached, the flags are
1353  * popped off.  If the stack is now empty, normal execution resumes.
1354  */
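/* A minimal usage sketch (illustrative only; "x" is a placeholder GRF and the
 * surrounding instruction-state defaults are assumed to be sane):
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, x, brw_imm_f(0.0f));
 *    brw_IF(p, BRW_EXECUTE_8);
 *       ...  "then" instructions  ...
 *    brw_ELSE(p);
 *       ...  "else" instructions  ...
 *    brw_ENDIF(p);
 */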
1355 brw_inst *
1356 brw_IF(struct brw_codegen *p, unsigned execute_size)
1357 {
1358    const struct gen_device_info *devinfo = p->devinfo;
1359    brw_inst *insn;
1360 
1361    insn = next_insn(p, BRW_OPCODE_IF);
1362 
1363    /* Override the defaults for this instruction:
1364     */
1365    if (devinfo->gen < 6) {
1366       brw_set_dest(p, insn, brw_ip_reg());
1367       brw_set_src0(p, insn, brw_ip_reg());
1368       brw_set_src1(p, insn, brw_imm_d(0x0));
1369    } else if (devinfo->gen == 6) {
1370       brw_set_dest(p, insn, brw_imm_w(0));
1371       brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1372       brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1373       brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1374    } else if (devinfo->gen == 7) {
1375       brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1376       brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1377       brw_set_src1(p, insn, brw_imm_w(0));
1378       brw_inst_set_jip(devinfo, insn, 0);
1379       brw_inst_set_uip(devinfo, insn, 0);
1380    } else {
1381       brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1382       if (devinfo->gen < 12)
1383          brw_set_src0(p, insn, brw_imm_d(0));
1384       brw_inst_set_jip(devinfo, insn, 0);
1385       brw_inst_set_uip(devinfo, insn, 0);
1386    }
1387 
1388    brw_inst_set_exec_size(devinfo, insn, execute_size);
1389    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1390    brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
1391    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1392    if (!p->single_program_flow && devinfo->gen < 6)
1393       brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1394 
1395    push_if_stack(p, insn);
1396    p->if_depth_in_loop[p->loop_stack_depth]++;
1397    return insn;
1398 }
1399 
1400 /* This function is only used for gen6-style IF instructions with an
1401  * embedded comparison (conditional modifier).  It is not used on gen7.
1402  */
1403 brw_inst *
1404 gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
1405 	struct brw_reg src0, struct brw_reg src1)
1406 {
1407    const struct gen_device_info *devinfo = p->devinfo;
1408    brw_inst *insn;
1409 
1410    insn = next_insn(p, BRW_OPCODE_IF);
1411 
1412    brw_set_dest(p, insn, brw_imm_w(0));
1413    brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1414    brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1415    brw_set_src0(p, insn, src0);
1416    brw_set_src1(p, insn, src1);
1417 
1418    assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
1419    assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
1420    brw_inst_set_cond_modifier(devinfo, insn, conditional);
1421 
1422    push_if_stack(p, insn);
1423    return insn;
1424 }
1425 
1426 /**
1427  * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1428  */
1429 static void
1430 convert_IF_ELSE_to_ADD(struct brw_codegen *p,
1431                        brw_inst *if_inst, brw_inst *else_inst)
1432 {
1433    const struct gen_device_info *devinfo = p->devinfo;
1434 
1435    /* The next instruction (where the ENDIF would be, if it existed) */
1436    brw_inst *next_inst = &p->store[p->nr_insn];
1437 
1438    assert(p->single_program_flow);
1439    assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1440    assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1441    assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);
1442 
1443    /* Convert IF to an ADD instruction that moves the instruction pointer
1444     * to the first instruction of the ELSE block.  If there is no ELSE
1445     * block, point to where ENDIF would be.  Reverse the predicate.
1446     *
1447     * There's no need to execute an ENDIF since we don't need to do any
1448     * stack operations, and if we're currently executing, we just want to
1449     * continue normally.
1450     */
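   /* The "* 16" factors below convert an instruction count into a byte
    * offset: each native pre-Gen6 instruction is 16 bytes, and the ADD
    * advances IP in bytes.
    */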
1451    brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
1452    brw_inst_set_pred_inv(devinfo, if_inst, true);
1453 
1454    if (else_inst != NULL) {
1455       /* Convert ELSE to an ADD instruction that points where the ENDIF
1456        * would be.
1457        */
1458       brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);
1459 
1460       brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
1461       brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
1462    } else {
1463       brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
1464    }
1465 }
1466 
1467 /**
1468  * Patch IF and ELSE instructions with appropriate jump targets.
1469  */
1470 static void
1471 patch_IF_ELSE(struct brw_codegen *p,
1472               brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
1473 {
1474    const struct gen_device_info *devinfo = p->devinfo;
1475 
1476    /* We shouldn't be patching IF and ELSE instructions in single program flow
1477     * mode when gen < 6, because in single program flow mode on those
1478     * platforms, we convert flow control instructions to conditional ADDs that
1479     * operate on IP (see brw_ENDIF).
1480     *
1481     * However, on Gen6, writing to IP doesn't work in single program flow mode
1482     * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1483     * not be updated by non-flow control instructions.").  And on later
1484     * platforms, there is no significant benefit to converting control flow
1485     * instructions to conditional ADDs.  So we do patch IF and ELSE
1486     * instructions in single program flow mode on those platforms.
1487     */
1488    if (devinfo->gen < 6)
1489       assert(!p->single_program_flow);
1490 
1491    assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1492    assert(endif_inst != NULL);
1493    assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1494 
1495    unsigned br = brw_jump_scale(devinfo);
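   /* br converts an instruction count into the unit the jump fields expect
    * (roughly: whole instructions on Gen4, 8-byte chunks on Gen5 through
    * Gen7.5, and bytes on Gen8+; see brw_jump_scale() for the exact rule).
    */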
1496 
1497    assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
1498    brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));
1499 
1500    if (else_inst == NULL) {
1501       /* Patch IF -> ENDIF */
1502       if (devinfo->gen < 6) {
1503 	 /* Turn it into an IFF, which means no mask stack operations for
1504 	  * all-false and jumping past the ENDIF.
1505 	  */
1506          brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
1507          brw_inst_set_gen4_jump_count(devinfo, if_inst,
1508                                       br * (endif_inst - if_inst + 1));
1509          brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1510       } else if (devinfo->gen == 6) {
1511 	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1512          brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
1513       } else {
1514          brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1515          brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
1516       }
1517    } else {
1518       brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));
1519 
1520       /* Patch IF -> ELSE */
1521       if (devinfo->gen < 6) {
1522          brw_inst_set_gen4_jump_count(devinfo, if_inst,
1523                                       br * (else_inst - if_inst));
1524          brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1525       } else if (devinfo->gen == 6) {
1526          brw_inst_set_gen6_jump_count(devinfo, if_inst,
1527                                       br * (else_inst - if_inst + 1));
1528       }
1529 
1530       /* Patch ELSE -> ENDIF */
1531       if (devinfo->gen < 6) {
1532 	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1533 	  * matching ENDIF.
1534 	  */
1535          brw_inst_set_gen4_jump_count(devinfo, else_inst,
1536                                       br * (endif_inst - else_inst + 1));
1537          brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
1538       } else if (devinfo->gen == 6) {
1539 	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1540          brw_inst_set_gen6_jump_count(devinfo, else_inst,
1541                                       br * (endif_inst - else_inst));
1542       } else {
1543 	 /* The IF instruction's JIP should point just past the ELSE */
1544          brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
1545 	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1546          brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1547          brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
1548          if (devinfo->gen >= 8) {
1549             /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
1550              * should point to ENDIF.
1551              */
1552             brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
1553          }
1554       }
1555    }
1556 }
1557 
1558 void
1559 brw_ELSE(struct brw_codegen *p)
1560 {
1561    const struct gen_device_info *devinfo = p->devinfo;
1562    brw_inst *insn;
1563 
1564    insn = next_insn(p, BRW_OPCODE_ELSE);
1565 
1566    if (devinfo->gen < 6) {
1567       brw_set_dest(p, insn, brw_ip_reg());
1568       brw_set_src0(p, insn, brw_ip_reg());
1569       brw_set_src1(p, insn, brw_imm_d(0x0));
1570    } else if (devinfo->gen == 6) {
1571       brw_set_dest(p, insn, brw_imm_w(0));
1572       brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1573       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1574       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1575    } else if (devinfo->gen == 7) {
1576       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1577       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1578       brw_set_src1(p, insn, brw_imm_w(0));
1579       brw_inst_set_jip(devinfo, insn, 0);
1580       brw_inst_set_uip(devinfo, insn, 0);
1581    } else {
1582       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1583       if (devinfo->gen < 12)
1584          brw_set_src0(p, insn, brw_imm_d(0));
1585       brw_inst_set_jip(devinfo, insn, 0);
1586       brw_inst_set_uip(devinfo, insn, 0);
1587    }
1588 
1589    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1590    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1591    if (!p->single_program_flow && devinfo->gen < 6)
1592       brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1593 
1594    push_if_stack(p, insn);
1595 }
1596 
1597 void
1598 brw_ENDIF(struct brw_codegen *p)
1599 {
1600    const struct gen_device_info *devinfo = p->devinfo;
1601    brw_inst *insn = NULL;
1602    brw_inst *else_inst = NULL;
1603    brw_inst *if_inst = NULL;
1604    brw_inst *tmp;
1605    bool emit_endif = true;
1606 
1607    /* In single program flow mode, we can express IF and ELSE instructions
1608     * equivalently as ADD instructions that operate on IP.  On platforms prior
1609     * to Gen6, flow control instructions cause an implied thread switch, so
1610     * this is a significant savings.
1611     *
1612     * However, on Gen6, writing to IP doesn't work in single program flow mode
1613     * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1614     * not be updated by non-flow control instructions.").  And on later
1615     * platforms, there is no significant benefit to converting control flow
1616     * instructions to conditional ADDs.  So we only do this trick on Gen4 and
1617     * Gen5.
1618     */
1619    if (devinfo->gen < 6 && p->single_program_flow)
1620       emit_endif = false;
1621 
1622    /*
1623     * A single next_insn() may change the base address of instruction store
1624     * memory (p->store), so call it before computing pointers into the
1625     * instruction store from a saved index.
1626     */
1627    if (emit_endif)
1628       insn = next_insn(p, BRW_OPCODE_ENDIF);
1629 
1630    /* Pop the IF and (optional) ELSE instructions from the stack */
1631    p->if_depth_in_loop[p->loop_stack_depth]--;
1632    tmp = pop_if_stack(p);
1633    if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1634       else_inst = tmp;
1635       tmp = pop_if_stack(p);
1636    }
1637    if_inst = tmp;
1638 
1639    if (!emit_endif) {
1640       /* ENDIF is useless; don't bother emitting it. */
1641       convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1642       return;
1643    }
1644 
1645    if (devinfo->gen < 6) {
1646       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1647       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1648       brw_set_src1(p, insn, brw_imm_d(0x0));
1649    } else if (devinfo->gen == 6) {
1650       brw_set_dest(p, insn, brw_imm_w(0));
1651       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1652       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1653    } else if (devinfo->gen == 7) {
1654       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1655       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1656       brw_set_src1(p, insn, brw_imm_w(0));
1657    } else {
1658       brw_set_src0(p, insn, brw_imm_d(0));
1659    }
1660 
1661    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1662    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1663    if (devinfo->gen < 6)
1664       brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1665 
1666    /* Also pop an item off the mask stack in the ENDIF instruction: */
1667    if (devinfo->gen < 6) {
1668       brw_inst_set_gen4_jump_count(devinfo, insn, 0);
1669       brw_inst_set_gen4_pop_count(devinfo, insn, 1);
1670    } else if (devinfo->gen == 6) {
1671       brw_inst_set_gen6_jump_count(devinfo, insn, 2);
1672    } else {
1673       brw_inst_set_jip(devinfo, insn, 2);
1674    }
1675    patch_IF_ELSE(p, if_inst, else_inst, insn);
1676 }
1677 
1678 brw_inst *
1679 brw_BREAK(struct brw_codegen *p)
1680 {
1681    const struct gen_device_info *devinfo = p->devinfo;
1682    brw_inst *insn;
1683 
1684    insn = next_insn(p, BRW_OPCODE_BREAK);
1685    if (devinfo->gen >= 8) {
1686       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1687       brw_set_src0(p, insn, brw_imm_d(0x0));
1688    } else if (devinfo->gen >= 6) {
1689       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1690       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1691       brw_set_src1(p, insn, brw_imm_d(0x0));
1692    } else {
1693       brw_set_dest(p, insn, brw_ip_reg());
1694       brw_set_src0(p, insn, brw_ip_reg());
1695       brw_set_src1(p, insn, brw_imm_d(0x0));
1696       brw_inst_set_gen4_pop_count(devinfo, insn,
1697                                   p->if_depth_in_loop[p->loop_stack_depth]);
1698    }
1699    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1700    brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1701 
1702    return insn;
1703 }
1704 
1705 brw_inst *
1706 brw_CONT(struct brw_codegen *p)
1707 {
1708    const struct gen_device_info *devinfo = p->devinfo;
1709    brw_inst *insn;
1710 
1711    insn = next_insn(p, BRW_OPCODE_CONTINUE);
1712    brw_set_dest(p, insn, brw_ip_reg());
1713    if (devinfo->gen >= 8) {
1714       brw_set_src0(p, insn, brw_imm_d(0x0));
1715    } else {
1716       brw_set_src0(p, insn, brw_ip_reg());
1717       brw_set_src1(p, insn, brw_imm_d(0x0));
1718    }
1719 
1720    if (devinfo->gen < 6) {
1721       brw_inst_set_gen4_pop_count(devinfo, insn,
1722                                   p->if_depth_in_loop[p->loop_stack_depth]);
1723    }
1724    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1725    brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1726    return insn;
1727 }
1728 
1729 brw_inst *
1730 brw_HALT(struct brw_codegen *p)
1731 {
1732    const struct gen_device_info *devinfo = p->devinfo;
1733    brw_inst *insn;
1734 
1735    insn = next_insn(p, BRW_OPCODE_HALT);
1736    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1737    if (devinfo->gen < 6) {
1738       /* From the Gen4 PRM:
1739        *
1740        *    "IP register must be put (for example, by the assembler) at <dst>
1741        *    and <src0> locations."
1742        */
1743       brw_set_dest(p, insn, brw_ip_reg());
1744       brw_set_src0(p, insn, brw_ip_reg());
1745       brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
1746    } else if (devinfo->gen < 8) {
1747       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1748       brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1749    } else if (devinfo->gen < 12) {
1750       brw_set_src0(p, insn, brw_imm_d(0x0));
1751    }
1752 
1753    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1754    brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1755    return insn;
1756 }
1757 
1758 /* DO/WHILE loop:
1759  *
1760  * The DO/WHILE is just an unterminated loop -- break or continue are
1761  * used for control within the loop.  We have a few ways they can be
1762  * done.
1763  *
1764  * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1765  * jip and no DO instruction.
1766  *
1767  * For non-uniform control flow pre-gen6, there's a DO instruction to
1768  * push the mask, and a WHILE to jump back, and BREAK to get out and
1769  * pop the mask.
1770  *
1771  * For gen6, there's no more mask stack, so no need for DO.  WHILE
1772  * just points back to the first instruction of the loop.
1773  */
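/* A rough emission sketch (illustrative only; the BREAK is assumed to be
 * predicated by the caller and "loop body" stands in for real instructions):
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *       ...  loop body  ...
 *       brw_BREAK(p);          (predicated exit; jump target patched later)
 *       ...  loop body  ...
 *    brw_WHILE(p);
 */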
1774 brw_inst *
1775 brw_DO(struct brw_codegen *p, unsigned execute_size)
1776 {
1777    const struct gen_device_info *devinfo = p->devinfo;
1778 
1779    if (devinfo->gen >= 6 || p->single_program_flow) {
1780       push_loop_stack(p, &p->store[p->nr_insn]);
1781       return &p->store[p->nr_insn];
1782    } else {
1783       brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1784 
1785       push_loop_stack(p, insn);
1786 
1787       /* Override the defaults for this instruction:
1788        */
1789       brw_set_dest(p, insn, brw_null_reg());
1790       brw_set_src0(p, insn, brw_null_reg());
1791       brw_set_src1(p, insn, brw_null_reg());
1792 
1793       brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1794       brw_inst_set_exec_size(devinfo, insn, execute_size);
1795       brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1796 
1797       return insn;
1798    }
1799 }
1800 
1801 /**
1802  * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1803  * instruction here.
1804  *
1805  * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1806  * nesting, since it can always just point to the end of the block/current loop.
1807  */
1808 static void
1809 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1810 {
1811    const struct gen_device_info *devinfo = p->devinfo;
1812    brw_inst *do_inst = get_inner_do_insn(p);
1813    brw_inst *inst;
1814    unsigned br = brw_jump_scale(devinfo);
1815 
1816    assert(devinfo->gen < 6);
1817 
1818    for (inst = while_inst - 1; inst != do_inst; inst--) {
1819       /* If the jump count is nonzero, this instruction has already been
1820        * patched because it is part of a loop nested inside the one we're
1821        * patching.
1822        */
1823       if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1824           brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1825          brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1826       } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1827                  brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1828          brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1829       }
1830    }
1831 }
1832 
1833 brw_inst *
1834 brw_WHILE(struct brw_codegen *p)
1835 {
1836    const struct gen_device_info *devinfo = p->devinfo;
1837    brw_inst *insn, *do_insn;
1838    unsigned br = brw_jump_scale(devinfo);
1839 
1840    if (devinfo->gen >= 6) {
1841       insn = next_insn(p, BRW_OPCODE_WHILE);
1842       do_insn = get_inner_do_insn(p);
1843 
1844       if (devinfo->gen >= 8) {
1845          brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1846          if (devinfo->gen < 12)
1847             brw_set_src0(p, insn, brw_imm_d(0));
1848          brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1849       } else if (devinfo->gen == 7) {
1850          brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1851          brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1852          brw_set_src1(p, insn, brw_imm_w(0));
1853          brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1854       } else {
1855          brw_set_dest(p, insn, brw_imm_w(0));
1856          brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1857          brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1858          brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1859       }
1860 
1861       brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1862 
1863    } else {
1864       if (p->single_program_flow) {
1865 	 insn = next_insn(p, BRW_OPCODE_ADD);
1866          do_insn = get_inner_do_insn(p);
1867 
1868 	 brw_set_dest(p, insn, brw_ip_reg());
1869 	 brw_set_src0(p, insn, brw_ip_reg());
1870 	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1871          brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1872       } else {
1873 	 insn = next_insn(p, BRW_OPCODE_WHILE);
1874          do_insn = get_inner_do_insn(p);
1875 
1876          assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1877 
1878 	 brw_set_dest(p, insn, brw_ip_reg());
1879 	 brw_set_src0(p, insn, brw_ip_reg());
1880 	 brw_set_src1(p, insn, brw_imm_d(0));
1881 
1882          brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1883          brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1884          brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1885 
1886 	 brw_patch_break_cont(p, insn);
1887       }
1888    }
1889    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1890 
1891    p->loop_stack_depth--;
1892 
1893    return insn;
1894 }
1895 
1896 /* FORWARD JUMPS:
1897  */
1898 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1899 {
1900    const struct gen_device_info *devinfo = p->devinfo;
1901    brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1902    unsigned jmpi = 1;
1903 
1904    if (devinfo->gen >= 5)
1905       jmpi = 2;
1906 
1907    assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1908    assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1909 
1910    brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1911                                 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1912 }
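/* An assumed usage pattern (for illustration): remember the index of the
 * JMPI, emit the instructions to be skipped, then patch the jump to land
 * just past them.
 *
 *    int jmp = p->nr_insn;
 *    brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL);
 *    ...  instructions to be skipped  ...
 *    brw_land_fwd_jump(p, jmp);
 */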
1913 
1914 /* To integrate with the above, it makes sense that the comparison
1915  * instruction should populate the flag register.  It might be simpler
1916  * just to use the flag reg for most WM tasks?
1917  */
1918 void brw_CMP(struct brw_codegen *p,
1919 	     struct brw_reg dest,
1920 	     unsigned conditional,
1921 	     struct brw_reg src0,
1922 	     struct brw_reg src1)
1923 {
1924    const struct gen_device_info *devinfo = p->devinfo;
1925    brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1926 
1927    brw_inst_set_cond_modifier(devinfo, insn, conditional);
1928    brw_set_dest(p, insn, dest);
1929    brw_set_src0(p, insn, src0);
1930    brw_set_src1(p, insn, src1);
1931 
1932    /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1933     * page says:
1934     *    "Any CMP instruction with a null destination must use a {switch}."
1935     *
1936     * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1937     * mentioned on their work-arounds pages.
1938     */
1939    if (devinfo->gen == 7) {
1940       if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1941           dest.nr == BRW_ARF_NULL) {
1942          brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1943       }
1944    }
1945 }
1946 
1947 /***********************************************************************
1948  * Helpers for the various SEND message types:
1949  */
1950 
1951 /** Extended math function, float[8].
1952  */
1953 void gen4_math(struct brw_codegen *p,
1954 	       struct brw_reg dest,
1955 	       unsigned function,
1956 	       unsigned msg_reg_nr,
1957 	       struct brw_reg src,
1958 	       unsigned precision )
1959 {
1960    const struct gen_device_info *devinfo = p->devinfo;
1961    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1962    unsigned data_type;
1963    if (has_scalar_region(src)) {
1964       data_type = BRW_MATH_DATA_SCALAR;
1965    } else {
1966       data_type = BRW_MATH_DATA_VECTOR;
1967    }
1968 
1969    assert(devinfo->gen < 6);
1970 
1971    /* Example code doesn't set predicate_control for send
1972     * instructions.
1973     */
1974    brw_inst_set_pred_control(devinfo, insn, 0);
1975    brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1976 
1977    brw_set_dest(p, insn, dest);
1978    brw_set_src0(p, insn, src);
1979    brw_set_math_message(p,
1980                         insn,
1981                         function,
1982                         src.type == BRW_REGISTER_TYPE_D,
1983                         precision,
1984                         data_type);
1985 }
1986 
1987 void gen6_math(struct brw_codegen *p,
1988 	       struct brw_reg dest,
1989 	       unsigned function,
1990 	       struct brw_reg src0,
1991 	       struct brw_reg src1)
1992 {
1993    const struct gen_device_info *devinfo = p->devinfo;
1994    brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
1995 
1996    assert(devinfo->gen >= 6);
1997 
1998    assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1999           (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
2000 
2001    assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
2002    if (devinfo->gen == 6) {
2003       assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
2004       assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
2005    }
2006 
2007    if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
2008        function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
2009        function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
2010       assert(src0.type != BRW_REGISTER_TYPE_F);
2011       assert(src1.type != BRW_REGISTER_TYPE_F);
2012       assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2013              (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2014    } else {
2015       assert(src0.type == BRW_REGISTER_TYPE_F ||
2016              (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
2017       assert(src1.type == BRW_REGISTER_TYPE_F ||
2018              (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
2019    }
2020 
2021    /* Source modifiers are ignored for extended math instructions on Gen6. */
2022    if (devinfo->gen == 6) {
2023       assert(!src0.negate);
2024       assert(!src0.abs);
2025       assert(!src1.negate);
2026       assert(!src1.abs);
2027    }
2028 
2029    brw_inst_set_math_function(devinfo, insn, function);
2030 
2031    brw_set_dest(p, insn, dest);
2032    brw_set_src0(p, insn, src0);
2033    brw_set_src1(p, insn, src1);
2034 }
2035 
2036 /**
2037  * Return the right surface index to access the thread scratch space using
2038  * stateless dataport messages.
2039  */
2040 unsigned
2041 brw_scratch_surface_idx(const struct brw_codegen *p)
2042 {
2043    /* The scratch space is thread-local so IA coherency is unnecessary. */
2044    if (p->devinfo->gen >= 8)
2045       return GEN8_BTI_STATELESS_NON_COHERENT;
2046    else
2047       return BRW_BTI_STATELESS;
2048 }
2049 
2050 /**
2051  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
2052  * using a constant offset per channel.
2053  *
2054  * The offset must be aligned to oword size (16 bytes).  Used for
2055  * register spilling.
2056  */
2057 void brw_oword_block_write_scratch(struct brw_codegen *p,
2058 				   struct brw_reg mrf,
2059 				   int num_regs,
2060 				   unsigned offset)
2061 {
2062    const struct gen_device_info *devinfo = p->devinfo;
2063    const unsigned target_cache =
2064       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2065        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2066        BRW_SFID_DATAPORT_WRITE);
2067    const struct tgl_swsb swsb = brw_get_default_swsb(p);
2068    uint32_t msg_type;
2069 
2070    if (devinfo->gen >= 6)
2071       offset /= 16;
2072 
2073    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2074 
2075    const unsigned mlen = 1 + num_regs;
2076 
2077    /* Set up the message header.  This is g0, with g0.2 filled with
2078     * the offset.  We don't want to leave our offset around in g0 or
2079     * it'll screw up texture samples, so set it up inside the message
2080     * reg.
2081     */
2082    {
2083       brw_push_insn_state(p);
2084       brw_set_default_exec_size(p, BRW_EXECUTE_8);
2085       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2086       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2087       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2088 
2089       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2090 
2091       /* set message header global offset field (reg 0, element 2) */
2092       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2093       brw_set_default_swsb(p, tgl_swsb_null());
2094       brw_MOV(p,
2095 	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2096 				  mrf.nr,
2097 				  2), BRW_REGISTER_TYPE_UD),
2098 	      brw_imm_ud(offset));
2099 
2100       brw_pop_insn_state(p);
2101       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2102    }
2103 
2104    {
2105       struct brw_reg dest;
2106       brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2107       int send_commit_msg;
2108       struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2109 					 BRW_REGISTER_TYPE_UW);
2110 
2111       brw_inst_set_sfid(devinfo, insn, target_cache);
2112       brw_inst_set_compression(devinfo, insn, false);
2113 
2114       if (brw_inst_exec_size(devinfo, insn) >= 16)
2115 	 src_header = vec16(src_header);
2116 
2117       assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2118       if (devinfo->gen < 6)
2119          brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2120 
2121       /* Until gen6, writes followed by reads from the same location
2122        * are not guaranteed to be ordered unless write_commit is set.
2123        * If set, then a no-op write is issued to the destination
2124        * register to set a dependency, and a read from the destination
2125        * can be used to ensure the ordering.
2126        *
2127        * For gen6, only writes between different threads need ordering
2128        * protection.  Our use of DP writes is all about register
2129        * spilling within a thread.
2130        */
2131       if (devinfo->gen >= 6) {
2132 	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2133 	 send_commit_msg = 0;
2134       } else {
2135 	 dest = src_header;
2136 	 send_commit_msg = 1;
2137       }
2138 
2139       brw_set_dest(p, insn, dest);
2140       if (devinfo->gen >= 6) {
2141 	 brw_set_src0(p, insn, mrf);
2142       } else {
2143 	 brw_set_src0(p, insn, brw_null_reg());
2144       }
2145 
2146       if (devinfo->gen >= 6)
2147 	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2148       else
2149 	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2150 
2151       brw_set_desc(p, insn,
2152                    brw_message_desc(devinfo, mlen, send_commit_msg, true) |
2153                    brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
2154                                      BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2155                                      msg_type, 0, /* not a render target */
2156                                      send_commit_msg));
2157    }
2158 }
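/* Illustrative spill call (parameter values are assumptions, not taken from a
 * real caller): write two GRFs of data, assumed already copied into m2..m3,
 * to byte offset 64 of the thread's scratch space, using m1 for the message
 * header.
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, 64);
 */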
2159 
2160 
2161 /**
2162  * Read a block of owords (half a GRF each) from the scratch buffer
2163  * using a constant index per channel.
2164  *
2165  * Offset must be aligned to oword size (16 bytes).  Used for register
2166  * spilling.
2167  */
2168 void
2169 brw_oword_block_read_scratch(struct brw_codegen *p,
2170 			     struct brw_reg dest,
2171 			     struct brw_reg mrf,
2172 			     int num_regs,
2173 			     unsigned offset)
2174 {
2175    const struct gen_device_info *devinfo = p->devinfo;
2176    const struct tgl_swsb swsb = brw_get_default_swsb(p);
2177 
2178    if (devinfo->gen >= 6)
2179       offset /= 16;
2180 
2181    if (p->devinfo->gen >= 7) {
2182       /* On gen 7 and above, we no longer have message registers and we can
2183        * send from any register we want.  By using the destination register
2184        * for the message, we guarantee that the implied message write won't
2185        * accidentally overwrite anything.  This has been a problem because
2186        * the MRF registers and source for the final FB write are both fixed
2187        * and may overlap.
2188        */
2189       mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2190    } else {
2191       mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2192    }
2193    dest = retype(dest, BRW_REGISTER_TYPE_UW);
2194 
2195    const unsigned rlen = num_regs;
2196    const unsigned target_cache =
2197       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2198        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2199        BRW_SFID_DATAPORT_READ);
2200 
2201    {
2202       brw_push_insn_state(p);
2203       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2204       brw_set_default_exec_size(p, BRW_EXECUTE_8);
2205       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2206       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2207 
2208       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2209 
2210       /* set message header global offset field (reg 0, element 2) */
2211       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2212       brw_set_default_swsb(p, tgl_swsb_null());
2213       brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2214 
2215       brw_pop_insn_state(p);
2216       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2217    }
2218 
2219    {
2220       brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2221 
2222       brw_inst_set_sfid(devinfo, insn, target_cache);
2223       assert(brw_inst_pred_control(devinfo, insn) == 0);
2224       brw_inst_set_compression(devinfo, insn, false);
2225 
2226       brw_set_dest(p, insn, dest);	/* UW? */
2227       if (devinfo->gen >= 6) {
2228 	 brw_set_src0(p, insn, mrf);
2229       } else {
2230 	 brw_set_src0(p, insn, brw_null_reg());
2231          brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2232       }
2233 
2234       brw_set_desc(p, insn,
2235                    brw_message_desc(devinfo, 1, rlen, true) |
2236                    brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
2237                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2238                                     BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2239                                     BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2240    }
2241 }
2242 
2243 void
2244 gen7_block_read_scratch(struct brw_codegen *p,
2245                         struct brw_reg dest,
2246                         int num_regs,
2247                         unsigned offset)
2248 {
2249    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2250    assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2251 
2252    brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2253 
2254    /* The HW requires that the header is present; this is to get the g0.5
2255     * scratch offset.
2256     */
2257    brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2258 
2259    /* According to the docs, offset is "A 12-bit HWord offset into the memory
2260     * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
2261     * is 32 bytes, which happens to be the size of a register.
2262     */
2263    offset /= REG_SIZE;
2264    assert(offset < (1 << 12));
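   /* For example, a byte offset of 4096 becomes HWord offset 4096 / 32 = 128,
    * comfortably within the 12-bit field.
    */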
2265 
2266    gen7_set_dp_scratch_message(p, insn,
2267                                false, /* scratch read */
2268                                false, /* OWords */
2269                                false, /* invalidate after read */
2270                                num_regs,
2271                                offset,
2272                                1,        /* mlen: just g0 */
2273                                num_regs, /* rlen */
2274                                true);    /* header present */
2275 }
2276 
2277 /**
2278  * Read float[4] vectors from the data port constant cache.
2279  * Location (in buffer) should be a multiple of 16.
2280  * Used for fetching shader constants.
2281  */
2282 void brw_oword_block_read(struct brw_codegen *p,
2283 			  struct brw_reg dest,
2284 			  struct brw_reg mrf,
2285 			  uint32_t offset,
2286 			  uint32_t bind_table_index)
2287 {
2288    const struct gen_device_info *devinfo = p->devinfo;
2289    const unsigned target_cache =
2290       (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
2291        BRW_SFID_DATAPORT_READ);
2292    const unsigned exec_size = 1 << brw_get_default_exec_size(p);
2293    const struct tgl_swsb swsb = brw_get_default_swsb(p);
2294 
2295    /* On newer hardware, offset is in units of owords. */
2296    if (devinfo->gen >= 6)
2297       offset /= 16;
2298 
2299    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2300 
2301    brw_push_insn_state(p);
2302    brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2303    brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2304    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2305 
2306    brw_push_insn_state(p);
2307    brw_set_default_exec_size(p, BRW_EXECUTE_8);
2308    brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2309    brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2310 
2311    /* set message header global offset field (reg 0, element 2) */
2312    brw_set_default_exec_size(p, BRW_EXECUTE_1);
2313    brw_set_default_swsb(p, tgl_swsb_null());
2314    brw_MOV(p,
2315 	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2316 			       mrf.nr,
2317 			       2), BRW_REGISTER_TYPE_UD),
2318 	   brw_imm_ud(offset));
2319    brw_pop_insn_state(p);
2320 
2321    brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2322 
2323    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2324 
2325    brw_inst_set_sfid(devinfo, insn, target_cache);
2326 
2327    /* cast dest to a uword[8] vector */
2328    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2329 
2330    brw_set_dest(p, insn, dest);
2331    if (devinfo->gen >= 6) {
2332       brw_set_src0(p, insn, mrf);
2333    } else {
2334       brw_set_src0(p, insn, brw_null_reg());
2335       brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2336    }
2337 
2338    brw_set_desc(p, insn,
2339                 brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
2340                 brw_dp_read_desc(devinfo, bind_table_index,
2341                                  BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2342                                  BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2343                                  BRW_DATAPORT_READ_TARGET_DATA_CACHE));
2344 
2345    brw_pop_insn_state(p);
2346 }
2347 
2348 brw_inst *
2349 brw_fb_WRITE(struct brw_codegen *p,
2350              struct brw_reg payload,
2351              struct brw_reg implied_header,
2352              unsigned msg_control,
2353              unsigned binding_table_index,
2354              unsigned msg_length,
2355              unsigned response_length,
2356              bool eot,
2357              bool last_render_target,
2358              bool header_present)
2359 {
2360    const struct gen_device_info *devinfo = p->devinfo;
2361    const unsigned target_cache =
2362       (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2363        BRW_SFID_DATAPORT_WRITE);
2364    brw_inst *insn;
2365    unsigned msg_type;
2366    struct brw_reg dest, src0;
2367 
2368    if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
2369       dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2370    else
2371       dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2372 
2373    if (devinfo->gen >= 6) {
2374       insn = next_insn(p, BRW_OPCODE_SENDC);
2375    } else {
2376       insn = next_insn(p, BRW_OPCODE_SEND);
2377    }
2378    brw_inst_set_sfid(devinfo, insn, target_cache);
2379    brw_inst_set_compression(devinfo, insn, false);
2380 
2381    if (devinfo->gen >= 6) {
2382       /* headerless version, just submit color payload */
2383       src0 = payload;
2384 
2385       msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2386    } else {
2387       assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2388       brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2389       src0 = implied_header;
2390 
2391       msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2392    }
2393 
2394    brw_set_dest(p, insn, dest);
2395    brw_set_src0(p, insn, src0);
2396    brw_set_desc(p, insn,
2397                 brw_message_desc(devinfo, msg_length, response_length,
2398                                  header_present) |
2399                 brw_dp_write_desc(devinfo, binding_table_index, msg_control,
2400                                   msg_type, last_render_target,
2401                                   0 /* send_commit_msg */));
2402    brw_inst_set_eot(devinfo, insn, eot);
2403 
2404    return insn;
2405 }
2406 
2407 brw_inst *
2408 gen9_fb_READ(struct brw_codegen *p,
2409              struct brw_reg dst,
2410              struct brw_reg payload,
2411              unsigned binding_table_index,
2412              unsigned msg_length,
2413              unsigned response_length,
2414              bool per_sample)
2415 {
2416    const struct gen_device_info *devinfo = p->devinfo;
2417    assert(devinfo->gen >= 9);
2418    const unsigned msg_subtype =
2419       brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
2420    brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2421 
2422    brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
2423    brw_set_dest(p, insn, dst);
2424    brw_set_src0(p, insn, payload);
2425    brw_set_desc(
2426       p, insn,
2427       brw_message_desc(devinfo, msg_length, response_length, true) |
2428       brw_dp_read_desc(devinfo, binding_table_index,
2429                        per_sample << 5 | msg_subtype,
2430                        GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2431                        BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2432    brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);
2433 
2434    return insn;
2435 }
2436 
2437 /**
2438  * Texture sample instruction.
2439  * Note: the msg_type plus msg_length values determine exactly what kind
2440  * of sampling operation is performed.  See volume 4, page 161 of docs.
2441  */
2442 void brw_SAMPLE(struct brw_codegen *p,
2443 		struct brw_reg dest,
2444 		unsigned msg_reg_nr,
2445 		struct brw_reg src0,
2446 		unsigned binding_table_index,
2447 		unsigned sampler,
2448 		unsigned msg_type,
2449 		unsigned response_length,
2450 		unsigned msg_length,
2451 		unsigned header_present,
2452 		unsigned simd_mode,
2453 		unsigned return_format)
2454 {
2455    const struct gen_device_info *devinfo = p->devinfo;
2456    brw_inst *insn;
2457 
2458    if (msg_reg_nr != -1)
2459       gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2460 
2461    insn = next_insn(p, BRW_OPCODE_SEND);
2462    brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
2463    brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2464 
2465    /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2466     *
2467     *    "Instruction compression is not allowed for this instruction (that
2468     *     is, send). The hardware behavior is undefined if this instruction is
2469     *     set as compressed. However, compress control can be set to "SecHalf"
2470     *     to affect the EMask generation."
2471     *
2472     * No similar wording is found in later PRMs, but there are examples
2473     * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
2474     * are allowed in SIMD16 mode and they could not work without SecHalf.  For
2475     * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2476     */
2477    brw_inst_set_compression(devinfo, insn, false);
2478 
2479    if (devinfo->gen < 6)
2480       brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2481 
2482    brw_set_dest(p, insn, dest);
2483    brw_set_src0(p, insn, src0);
2484    brw_set_desc(p, insn,
2485                 brw_message_desc(devinfo, msg_length, response_length,
2486                                  header_present) |
2487                 brw_sampler_desc(devinfo, binding_table_index, sampler,
2488                                  msg_type, simd_mode, return_format));
2489 }
2490 
2491 /* Adjust the message header's sampler state pointer to
2492  * select the correct group of 16 samplers.
2493  */
2494 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2495                                       struct brw_reg header,
2496                                       struct brw_reg sampler_index)
2497 {
2498    /* The "Sampler Index" field can only store values between 0 and 15.
2499     * However, we can add an offset to the "Sampler State Pointer"
2500     * field, effectively selecting a different set of 16 samplers.
2501     *
2502     * The "Sampler State Pointer" needs to be aligned to a 32-byte
2503     * offset, and each sampler state is only 16 bytes, so we can't
2504     * exclusively use the offset - we have to use both.
2505     */
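   /* Worked example (illustrative): for an immediate sampler_index of 20, the
    * code below adds 16 * (20 / 16) * 16 = 256 bytes to the pointer, i.e. it
    * skips past the first group of 16 sampler states.
    */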
2506 
2507    const struct gen_device_info *devinfo = p->devinfo;
2508 
2509    if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2510       const int sampler_state_size = 16; /* 16 bytes */
2511       uint32_t sampler = sampler_index.ud;
2512 
2513       if (sampler >= 16) {
2514          assert(devinfo->is_haswell || devinfo->gen >= 8);
2515          brw_ADD(p,
2516                  get_element_ud(header, 3),
2517                  get_element_ud(brw_vec8_grf(0, 0), 3),
2518                  brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2519       }
2520    } else {
2521       /* Non-const sampler array indexing case */
2522       if (devinfo->gen < 8 && !devinfo->is_haswell) {
2523          return;
2524       }
2525 
2526       struct brw_reg temp = get_element_ud(header, 3);
2527 
2528       brw_push_insn_state(p);
2529       brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2530       brw_set_default_swsb(p, tgl_swsb_regdist(1));
2531       brw_SHL(p, temp, temp, brw_imm_ud(4));
2532       brw_ADD(p,
2533               get_element_ud(header, 3),
2534               get_element_ud(brw_vec8_grf(0, 0), 3),
2535               temp);
2536       brw_pop_insn_state(p);
2537    }
2538 }
2539 
2540 /* All these variables are pretty confusing - we might be better off
2541  * using bitmasks and macros for this, in the old style.  Or perhaps
2542  * just having the caller instantiate the fields in dword3 itself.
2543  */
2544 void brw_urb_WRITE(struct brw_codegen *p,
2545 		   struct brw_reg dest,
2546 		   unsigned msg_reg_nr,
2547 		   struct brw_reg src0,
2548                    enum brw_urb_write_flags flags,
2549 		   unsigned msg_length,
2550 		   unsigned response_length,
2551 		   unsigned offset,
2552 		   unsigned swizzle)
2553 {
2554    const struct gen_device_info *devinfo = p->devinfo;
2555    brw_inst *insn;
2556 
2557    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2558 
2559    if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2560       /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2561       brw_push_insn_state(p);
2562       brw_set_default_access_mode(p, BRW_ALIGN_1);
2563       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2564       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2565       brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2566 		       BRW_REGISTER_TYPE_UD),
2567 	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2568 		brw_imm_ud(0xff00));
2569       brw_pop_insn_state(p);
2570    }
2571 
2572    insn = next_insn(p, BRW_OPCODE_SEND);
2573 
2574    assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2575 
2576    brw_set_dest(p, insn, dest);
2577    brw_set_src0(p, insn, src0);
2578    brw_set_src1(p, insn, brw_imm_d(0));
2579 
2580    if (devinfo->gen < 6)
2581       brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2582 
2583    brw_set_urb_message(p,
2584 		       insn,
2585 		       flags,
2586 		       msg_length,
2587 		       response_length,
2588 		       offset,
2589 		       swizzle);
2590 }
2591 
2592 void
2593 brw_send_indirect_message(struct brw_codegen *p,
2594                           unsigned sfid,
2595                           struct brw_reg dst,
2596                           struct brw_reg payload,
2597                           struct brw_reg desc,
2598                           unsigned desc_imm,
2599                           bool eot)
2600 {
2601    const struct gen_device_info *devinfo = p->devinfo;
2602    struct brw_inst *send;
2603 
2604    dst = retype(dst, BRW_REGISTER_TYPE_UW);
2605 
2606    assert(desc.type == BRW_REGISTER_TYPE_UD);
2607 
2608    if (desc.file == BRW_IMMEDIATE_VALUE) {
2609       send = next_insn(p, BRW_OPCODE_SEND);
2610       brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2611       brw_set_desc(p, send, desc.ud | desc_imm);
2612    } else {
2613       const struct tgl_swsb swsb = brw_get_default_swsb(p);
2614       struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2615 
2616       brw_push_insn_state(p);
2617       brw_set_default_access_mode(p, BRW_ALIGN_1);
2618       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2619       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2620       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2621       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2622 
2623       /* Load the indirect descriptor to an address register using OR so the
2624        * caller can specify additional descriptor bits with the desc_imm
2625        * immediate.
2626        */
2627       brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2628 
2629       brw_pop_insn_state(p);
2630 
2631       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2632       send = next_insn(p, BRW_OPCODE_SEND);
2633       brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2634 
2635       if (devinfo->gen >= 12)
2636          brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
2637       else
2638          brw_set_src1(p, send, addr);
2639    }
2640 
2641    brw_set_dest(p, send, dst);
2642    brw_inst_set_sfid(devinfo, send, sfid);
2643    brw_inst_set_eot(devinfo, send, eot);
2644 }
2645 
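/**
 * Emit a split SEND (SENDS prior to Gen12) with two payload sources.  The
 * descriptor and extended descriptor may each be an immediate or a
 * register; register descriptors are OR'ed with their immediate parts into
 * a0.0 and a0.2 respectively before the message is emitted.  Note that the
 * extended descriptor must also carry the SFID and EOT bits (see below).
 */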
2646 void
2647 brw_send_indirect_split_message(struct brw_codegen *p,
2648                                 unsigned sfid,
2649                                 struct brw_reg dst,
2650                                 struct brw_reg payload0,
2651                                 struct brw_reg payload1,
2652                                 struct brw_reg desc,
2653                                 unsigned desc_imm,
2654                                 struct brw_reg ex_desc,
2655                                 unsigned ex_desc_imm,
2656                                 bool eot)
2657 {
2658    const struct gen_device_info *devinfo = p->devinfo;
2659    struct brw_inst *send;
2660 
2661    dst = retype(dst, BRW_REGISTER_TYPE_UW);
2662 
2663    assert(desc.type == BRW_REGISTER_TYPE_UD);
2664 
2665    if (desc.file == BRW_IMMEDIATE_VALUE) {
2666       desc.ud |= desc_imm;
2667    } else {
2668       const struct tgl_swsb swsb = brw_get_default_swsb(p);
2669       struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2670 
2671       brw_push_insn_state(p);
2672       brw_set_default_access_mode(p, BRW_ALIGN_1);
2673       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2674       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2675       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2676       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2677 
2678       /* Load the indirect descriptor to an address register using OR so the
2679        * caller can specify additional descriptor bits with the desc_imm
2680        * immediate.
2681        */
2682       brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2683 
2684       brw_pop_insn_state(p);
2685       desc = addr;
2686 
2687       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2688    }
2689 
2690    if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
2691        (devinfo->gen >= 12 || (ex_desc.ud & INTEL_MASK(15, 12)) == 0)) {
2692       ex_desc.ud |= ex_desc_imm;
2693    } else {
2694       const struct tgl_swsb swsb = brw_get_default_swsb(p);
2695       struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);
2696 
2697       brw_push_insn_state(p);
2698       brw_set_default_access_mode(p, BRW_ALIGN_1);
2699       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2700       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2701       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2702       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2703 
2704       /* Load the indirect extended descriptor to an address register using OR
2705        * so the caller can specify additional descriptor bits with the
2706        * desc_imm immediate.
2707        *
2708        * Even though the instruction dispatcher always pulls the SFID and EOT
2709        * fields from the instruction itself, the actual external unit which
2710        * processes the message gets the SFID and EOT from the extended
2711        * descriptor which comes from the address register.  If we don't OR
2712        * those two bits in, the external unit may get confused and hang.
2713        */
2714       unsigned imm_part = ex_desc_imm | sfid | eot << 5;
2715 
2716       if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
2717          /* ex_desc bits 15:12 don't exist in the instruction encoding prior
2718           * to Gen12, so we may have fallen back to an indirect extended
2719           * descriptor.
2720           */
2721          brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
2722       } else {
2723          brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
2724       }
2725 
2726       brw_pop_insn_state(p);
2727       ex_desc = addr;
2728 
2729       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2730    }
2731 
2732    send = next_insn(p, devinfo->gen >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
2733    brw_set_dest(p, send, dst);
2734    brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
2735    brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));
2736 
2737    if (desc.file == BRW_IMMEDIATE_VALUE) {
2738       brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
2739       brw_inst_set_send_desc(devinfo, send, desc.ud);
2740    } else {
2741       assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
2742       assert(desc.nr == BRW_ARF_ADDRESS);
2743       assert(desc.subnr == 0);
2744       brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
2745    }
2746 
2747    if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
2748       brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
2749       brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
2750    } else {
2751       assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
2752       assert(ex_desc.nr == BRW_ARF_ADDRESS);
2753       assert((ex_desc.subnr & 0x3) == 0);
2754       brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
2755       brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
2756    }
2757 
2758    brw_inst_set_sfid(devinfo, send, sfid);
2759    brw_inst_set_eot(devinfo, send, eot);
2760 }
2761 
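/* Emit a surface access SEND whose binding table index may come from a
 * register.  A non-immediate surface index is masked to 8 bits and loaded
 * into a0.0, which is then used as the dynamic part of the message
 * descriptor by brw_send_indirect_message().
 */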
2762 static void
2763 brw_send_indirect_surface_message(struct brw_codegen *p,
2764                                   unsigned sfid,
2765                                   struct brw_reg dst,
2766                                   struct brw_reg payload,
2767                                   struct brw_reg surface,
2768                                   unsigned desc_imm)
2769 {
2770    if (surface.file != BRW_IMMEDIATE_VALUE) {
2771       const struct tgl_swsb swsb = brw_get_default_swsb(p);
2772       struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2773 
2774       brw_push_insn_state(p);
2775       brw_set_default_access_mode(p, BRW_ALIGN_1);
2776       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2777       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2778       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2779       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2780 
2781       /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2782        * some surface array is accessed out of bounds.
2783        */
2784       brw_AND(p, addr,
2785               suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2786                         BRW_GET_SWZ(surface.swizzle, 0)),
2787               brw_imm_ud(0xff));
2788 
2789       brw_pop_insn_state(p);
2790 
2791       surface = addr;
2792       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2793    }
2794 
2795    brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
2796 }
2797 
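/* Check whether the WHILE instruction at while_offset jumps back to or
 * before start_offset, i.e. whether it closes the loop containing
 * start_offset rather than a sibling do...while loop.
 */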
2798 static bool
2799 while_jumps_before_offset(const struct gen_device_info *devinfo,
2800                           brw_inst *insn, int while_offset, int start_offset)
2801 {
2802    int scale = 16 / brw_jump_scale(devinfo);
2803    int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2804                                : brw_inst_jip(devinfo, insn);
2805    assert(jip < 0);
2806    return while_offset + jip * scale <= start_offset;
2807 }
2808 
2809 
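/* Scan forward from start_offset and return the offset of the instruction
 * that ends the current control flow block (ELSE, ENDIF, HALT, or the
 * WHILE that closes the enclosing loop), or 0 if no such instruction is
 * found before the end of the program.
 */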
2810 static int
2811 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2812 {
2813    int offset;
2814    void *store = p->store;
2815    const struct gen_device_info *devinfo = p->devinfo;
2816 
2817    int depth = 0;
2818 
2819    for (offset = next_offset(devinfo, store, start_offset);
2820         offset < p->next_insn_offset;
2821         offset = next_offset(devinfo, store, offset)) {
2822       brw_inst *insn = store + offset;
2823 
2824       switch (brw_inst_opcode(devinfo, insn)) {
2825       case BRW_OPCODE_IF:
2826          depth++;
2827          break;
2828       case BRW_OPCODE_ENDIF:
2829          if (depth == 0)
2830             return offset;
2831          depth--;
2832          break;
2833       case BRW_OPCODE_WHILE:
2834          /* If the while doesn't jump before our instruction, it's the end
2835           * of a sibling do...while loop.  Ignore it.
2836           */
2837          if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2838             continue;
2839          /* fallthrough */
2840       case BRW_OPCODE_ELSE:
2841       case BRW_OPCODE_HALT:
2842          if (depth == 0)
2843             return offset;
2844       default:
2845          break;
2846       }
2847    }
2848 
2849    return 0;
2850 }
2851 
2852 /* There is no DO instruction on gen6, so to find the end of the loop
2853  * we have to see if the loop is jumping back before our start
2854  * instruction.
2855  */
2856 static int
2857 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2858 {
2859    const struct gen_device_info *devinfo = p->devinfo;
2860    int offset;
2861    void *store = p->store;
2862 
2863    assert(devinfo->gen >= 6);
2864 
2865    /* Always start after the instruction (such as a WHILE) we're trying to fix
2866     * up.
2867     */
2868    for (offset = next_offset(devinfo, store, start_offset);
2869         offset < p->next_insn_offset;
2870         offset = next_offset(devinfo, store, offset)) {
2871       brw_inst *insn = store + offset;
2872 
2873       if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2874 	 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2875 	    return offset;
2876       }
2877    }
2878    assert(!"not reached");
2879    return start_offset;
2880 }
2881 
2882 /* After program generation, go back and update the UIP and JIP of
2883  * BREAK, CONTINUE, ENDIF, and HALT instructions to their correct locations.
2884  */
2885 void
2886 brw_set_uip_jip(struct brw_codegen *p, int start_offset)
2887 {
2888    const struct gen_device_info *devinfo = p->devinfo;
2889    int offset;
2890    int br = brw_jump_scale(devinfo);
2891    int scale = 16 / br;
2892    void *store = p->store;
2893 
2894    if (devinfo->gen < 6)
2895       return;
2896 
2897    for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2898       brw_inst *insn = store + offset;
2899       assert(brw_inst_cmpt_control(devinfo, insn) == 0);
2900 
2901       int block_end_offset = brw_find_next_block_end(p, offset);
2902       switch (brw_inst_opcode(devinfo, insn)) {
2903       case BRW_OPCODE_BREAK:
2904          assert(block_end_offset != 0);
2905          brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2906 	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2907          brw_inst_set_uip(devinfo, insn,
2908 	    (brw_find_loop_end(p, offset) - offset +
2909              (devinfo->gen == 6 ? 16 : 0)) / scale);
2910 	 break;
2911       case BRW_OPCODE_CONTINUE:
2912          assert(block_end_offset != 0);
2913          brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2914          brw_inst_set_uip(devinfo, insn,
2915             (brw_find_loop_end(p, offset) - offset) / scale);
2916 
2917          assert(brw_inst_uip(devinfo, insn) != 0);
2918          assert(brw_inst_jip(devinfo, insn) != 0);
2919 	 break;
2920 
2921       case BRW_OPCODE_ENDIF: {
2922          int32_t jump = (block_end_offset == 0) ?
2923                         1 * br : (block_end_offset - offset) / scale;
2924          if (devinfo->gen >= 7)
2925             brw_inst_set_jip(devinfo, insn, jump);
2926          else
2927             brw_inst_set_gen6_jump_count(devinfo, insn, jump);
2928 	 break;
2929       }
2930 
2931       case BRW_OPCODE_HALT:
2932 	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2933 	  *
2934 	  *    "In case of the halt instruction not inside any conditional
2935 	  *     code block, the value of <JIP> and <UIP> should be the
2936 	  *     same. In case of the halt instruction inside conditional code
2937 	  *     block, the <UIP> should be the end of the program, and the
2938 	  *     <JIP> should be end of the most inner conditional code block."
2939 	  *
2940 	  * The uip will have already been set by whoever set up the
2941 	  * instruction.
2942 	  */
2943 	 if (block_end_offset == 0) {
2944             brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
2945 	 } else {
2946             brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2947 	 }
2948          assert(brw_inst_uip(devinfo, insn) != 0);
2949          assert(brw_inst_jip(devinfo, insn) != 0);
2950 	 break;
2951 
2952       default:
2953          break;
2954       }
2955    }
2956 }
2957 
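/**
 * Emit the SEND for an FF_SYNC message; the message descriptor itself is
 * filled in by brw_set_ff_sync_message().
 */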
2958 void brw_ff_sync(struct brw_codegen *p,
2959 		   struct brw_reg dest,
2960 		   unsigned msg_reg_nr,
2961 		   struct brw_reg src0,
2962 		   bool allocate,
2963 		   unsigned response_length,
2964 		   bool eot)
2965 {
2966    const struct gen_device_info *devinfo = p->devinfo;
2967    brw_inst *insn;
2968 
2969    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2970 
2971    insn = next_insn(p, BRW_OPCODE_SEND);
2972    brw_set_dest(p, insn, dest);
2973    brw_set_src0(p, insn, src0);
2974    brw_set_src1(p, insn, brw_imm_d(0));
2975 
2976    if (devinfo->gen < 6)
2977       brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2978 
2979    brw_set_ff_sync_message(p,
2980 			   insn,
2981 			   allocate,
2982 			   response_length,
2983 			   eot);
2984 }
2985 
2986 /**
2987  * Emit the SEND instruction necessary to generate stream output data on Gen6
2988  * (for transform feedback).
2989  *
2990  * If send_commit_msg is true, this is the last piece of stream output data
2991  * from this thread, so send the data as a committed write.  According to the
2992  * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2993  *
2994  *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2995  *   writes are complete by sending the final write as a committed write."
2996  */
2997 void
2998 brw_svb_write(struct brw_codegen *p,
2999               struct brw_reg dest,
3000               unsigned msg_reg_nr,
3001               struct brw_reg src0,
3002               unsigned binding_table_index,
3003               bool   send_commit_msg)
3004 {
3005    const struct gen_device_info *devinfo = p->devinfo;
3006    const unsigned target_cache =
3007       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
3008        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
3009        BRW_SFID_DATAPORT_WRITE);
3010    brw_inst *insn;
3011 
3012    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
3013 
3014    insn = next_insn(p, BRW_OPCODE_SEND);
3015    brw_inst_set_sfid(devinfo, insn, target_cache);
3016    brw_set_dest(p, insn, dest);
3017    brw_set_src0(p, insn, src0);
3018    brw_set_desc(p, insn,
3019                 brw_message_desc(devinfo, 1, send_commit_msg, true) |
3020                 brw_dp_write_desc(devinfo, binding_table_index,
3021                                   0, /* msg_control: ignored */
3022                                   GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
3023                                   0, /* last_render_target: ignored */
3024                                   send_commit_msg)); /* send_commit_msg */
3025 }
3026 
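/* Number of registers occupied by num_channels components of data at the
 * given execution size: 1 for SIMD4x2 (exec_size == 0), num_channels for
 * SIMD8 and narrower, and 2 * num_channels for SIMD16 (e.g. four channels
 * at SIMD16 take 8 registers).
 */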
3027 static unsigned
3028 brw_surface_payload_size(struct brw_codegen *p,
3029                          unsigned num_channels,
3030                          unsigned exec_size /**< 0 for SIMD4x2 */)
3031 {
3032    if (exec_size == 0)
3033       return 1; /* SIMD4x2 */
3034    else if (exec_size <= 8)
3035       return num_channels;
3036    else
3037       return 2 * num_channels;
3038 }
3039 
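/**
 * Emit an untyped atomic operation SEND to the data cache.  The response
 * length is derived from the execution size and whether a return value is
 * expected, and unused destination components are masked off (see the
 * comment on the writemask below).
 */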
3040 void
3041 brw_untyped_atomic(struct brw_codegen *p,
3042                    struct brw_reg dst,
3043                    struct brw_reg payload,
3044                    struct brw_reg surface,
3045                    unsigned atomic_op,
3046                    unsigned msg_length,
3047                    bool response_expected,
3048                    bool header_present)
3049 {
3050    const struct gen_device_info *devinfo = p->devinfo;
3051    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3052                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3053                           GEN7_SFID_DATAPORT_DATA_CACHE);
3054    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3055    /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
3056    const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
3057    const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3058                               has_simd4x2 ? 0 : 8;
3059    const unsigned response_length =
3060       brw_surface_payload_size(p, response_expected, exec_size);
3061    const unsigned desc =
3062       brw_message_desc(devinfo, msg_length, response_length, header_present) |
3063       brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
3064                                  response_expected);
3065    /* Mask out unused components -- This is especially important in Align16
3066     * mode on generations that don't have native support for SIMD4x2 atomics,
3067     * because unused but enabled components will cause the dataport to perform
3068     * additional atomic operations on the addresses that happen to be in the
3069     * uninitialized Y, Z and W coordinates of the payload.
3070     */
3071    const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3072 
3073    brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
3074                                      payload, surface, desc);
3075 }
3076 
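/**
 * Emit an untyped surface read SEND; the response length is derived from
 * num_channels and the current execution size.
 */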
3077 void
3078 brw_untyped_surface_read(struct brw_codegen *p,
3079                          struct brw_reg dst,
3080                          struct brw_reg payload,
3081                          struct brw_reg surface,
3082                          unsigned msg_length,
3083                          unsigned num_channels)
3084 {
3085    const struct gen_device_info *devinfo = p->devinfo;
3086    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3087                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3088                           GEN7_SFID_DATAPORT_DATA_CACHE);
3089    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3090    const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
3091    const unsigned response_length =
3092       brw_surface_payload_size(p, num_channels, exec_size);
3093    const unsigned desc =
3094       brw_message_desc(devinfo, msg_length, response_length, false) |
3095       brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
3096 
3097    brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
3098 }
3099 
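/**
 * Emit an untyped surface write SEND.  No response is requested, and as in
 * brw_untyped_atomic() the unused components are masked off when SIMD4x2
 * is not available.
 */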
3100 void
3101 brw_untyped_surface_write(struct brw_codegen *p,
3102                           struct brw_reg payload,
3103                           struct brw_reg surface,
3104                           unsigned msg_length,
3105                           unsigned num_channels,
3106                           bool header_present)
3107 {
3108    const struct gen_device_info *devinfo = p->devinfo;
3109    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3110                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3111                           GEN7_SFID_DATAPORT_DATA_CACHE);
3112    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3113    /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3114    const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
3115    const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3116                               has_simd4x2 ? 0 : 8;
3117    const unsigned desc =
3118       brw_message_desc(devinfo, msg_length, 0, header_present) |
3119       brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
3120    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3121    const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
3122 
3123    brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
3124                                      payload, surface, desc);
3125 }
3126 
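/* Fill in the message descriptor for a memory fence to the render cache or
 * data cache, optionally requesting a commit write-back (bit 5 of the
 * message control field).
 */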
3127 static void
3128 brw_set_memory_fence_message(struct brw_codegen *p,
3129                              struct brw_inst *insn,
3130                              enum brw_message_target sfid,
3131                              bool commit_enable,
3132                              unsigned bti)
3133 {
3134    const struct gen_device_info *devinfo = p->devinfo;
3135 
3136    brw_set_desc(p, insn, brw_message_desc(
3137                    devinfo, 1, (commit_enable ? 1 : 0), true));
3138 
3139    brw_inst_set_sfid(devinfo, insn, sfid);
3140 
3141    switch (sfid) {
3142    case GEN6_SFID_DATAPORT_RENDER_CACHE:
3143       brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3144       break;
3145    case GEN7_SFID_DATAPORT_DATA_CACHE:
3146       brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3147       break;
3148    default:
3149       unreachable("Not reached");
3150    }
3151 
3152    if (commit_enable)
3153       brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3154 
3155    assert(devinfo->gen >= 11 || bti == 0);
3156    brw_inst_set_binding_table_index(devinfo, insn, bti);
3157 }
3158 
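/**
 * Emit a memory fence SEND with the given opcode and shared function,
 * optionally requesting a commit write-back into dst.
 */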
3159 void
3160 brw_memory_fence(struct brw_codegen *p,
3161                  struct brw_reg dst,
3162                  struct brw_reg src,
3163                  enum opcode send_op,
3164                  enum brw_message_target sfid,
3165                  bool commit_enable,
3166                  unsigned bti)
3167 {
3168    const struct gen_device_info *devinfo = p->devinfo;
3169 
3170    dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
3171    src = retype(vec1(src), BRW_REGISTER_TYPE_UD);
3172 
3173    /* Set dst as the destination for dependency tracking; the MEMORY_FENCE
3174     * message doesn't write anything back.
3175     */
3176    struct brw_inst *insn = next_insn(p, send_op);
3177    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3178    brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3179    brw_set_dest(p, insn, dst);
3180    brw_set_src0(p, insn, src);
3181    brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
3182 }
3183 
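/**
 * Emit a SEND to the pixel interpolator shared function.  The descriptor
 * encodes the interpolation mode, perspective correction, SIMD mode and
 * slot group; data may be a register or an immediate (see the comment at
 * the brw_send_indirect_message() call below).
 */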
3184 void
3185 brw_pixel_interpolator_query(struct brw_codegen *p,
3186                              struct brw_reg dest,
3187                              struct brw_reg mrf,
3188                              bool noperspective,
3189                              unsigned mode,
3190                              struct brw_reg data,
3191                              unsigned msg_length,
3192                              unsigned response_length)
3193 {
3194    const struct gen_device_info *devinfo = p->devinfo;
3195    const uint16_t exec_size = brw_get_default_exec_size(p);
3196    const unsigned slot_group = brw_get_default_group(p) / 16;
3197    const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3198    const unsigned desc =
3199       brw_message_desc(devinfo, msg_length, response_length, false) |
3200       brw_pixel_interp_desc(devinfo, mode, noperspective, simd_mode,
3201                             slot_group);
3202 
3203    /* brw_send_indirect_message will automatically use a direct send message
3204     * if data is actually immediate.
3205     */
3206    brw_send_indirect_message(p,
3207                              GEN7_SFID_PIXEL_INTERPOLATOR,
3208                              dest,
3209                              mrf,
3210                              vec1(data),
3211                              desc,
3212                              false);
3213 }
3214 
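/**
 * Emit code that writes the index of the first enabled channel of the
 * current thread into component 0 of dst.  In Align1 mode this is an FBL
 * on the execution mask (reconstructed in a flag register on Gen7, read
 * from ce0 on Gen8+); the Align16 paths below handle SIMD4x2.
 */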
3215 void
3216 brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
3217                       struct brw_reg mask)
3218 {
3219    const struct gen_device_info *devinfo = p->devinfo;
3220    const unsigned exec_size = 1 << brw_get_default_exec_size(p);
3221    const unsigned qtr_control = brw_get_default_group(p) / 8;
3222    brw_inst *inst;
3223 
3224    assert(devinfo->gen >= 7);
3225    assert(mask.type == BRW_REGISTER_TYPE_UD);
3226 
3227    brw_push_insn_state(p);
3228 
3229    /* The flag register is only used on Gen7 in align1 mode, so avoid setting
3230     * unnecessary bits in the instruction words, get the information we need
3231     * and reset the default flag register. This allows more instructions to be
3232     * compacted.
3233     */
3234    const unsigned flag_subreg = p->current->flag_subreg;
3235    brw_set_default_flag_reg(p, 0, 0);
3236 
3237    if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3238       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3239 
3240       if (devinfo->gen >= 8) {
3241          /* Getting the first active channel index is easy on Gen8: Just find
3242           * the first bit set in the execution mask.  The register exists on
3243           * HSW already but it reads back as all ones when the current
3244           * instruction has execution masking disabled, so it's kind of
3245           * useless.
3246           */
3247          struct brw_reg exec_mask =
3248             retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);
3249 
3250          brw_set_default_exec_size(p, BRW_EXECUTE_1);
3251          if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
3252             /* Unfortunately, ce0 does not take into account the thread
3253              * dispatch mask, which may be a problem in cases where it's not
3254              * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3255              * some n).  Combine ce0 with the given dispatch (or vector) mask
3256              * to mask off those channels which were never dispatched by the
3257              * hardware.
3258              */
3259             brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
3260             brw_set_default_swsb(p, tgl_swsb_regdist(1));
3261             brw_AND(p, vec1(dst), exec_mask, vec1(dst));
3262             exec_mask = vec1(dst);
3263          }
3264 
3265          /* Quarter control has the effect of magically shifting the value of
3266           * ce0 so you'll get the first active channel relative to the
3267           * specified quarter control as the result.
3268           */
3269          inst = brw_FBL(p, vec1(dst), exec_mask);
3270       } else {
3271          const struct brw_reg flag = brw_flag_subreg(flag_subreg);
3272 
3273          brw_set_default_exec_size(p, BRW_EXECUTE_1);
3274          brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
3275 
3276          /* Run enough instructions returning zero with execution masking and
3277           * a conditional modifier enabled in order to get the full execution
3278           * mask in f1.0.  We could use a single 32-wide move here if it
3279        * weren't for the hardware bug that causes channel enables to
3280           * be applied incorrectly to the second half of 32-wide instructions
3281           * on Gen7.
3282           */
3283          const unsigned lower_size = MIN2(16, exec_size);
3284          for (unsigned i = 0; i < exec_size / lower_size; i++) {
3285             inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
3286                            brw_imm_uw(0));
3287             brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
3288             brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
3289             brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
3290             brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
3291             brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
3292             brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
3293          }
3294 
3295          /* Find the first bit set in the exec_size-wide portion of the flag
3296           * register that was updated by the last sequence of MOV
3297           * instructions.
3298           */
3299          const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
3300          brw_set_default_exec_size(p, BRW_EXECUTE_1);
3301          brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
3302       }
3303    } else {
3304       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3305 
3306       if (devinfo->gen >= 8 &&
3307           mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
3308          /* In SIMD4x2 mode the first active channel index is just the
3309           * negation of the first bit of the mask register.  Note that ce0
3310           * doesn't take into account the dispatch mask, so the Gen7 path
3311           * should be used instead unless you have the guarantee that the
3312           * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
3313           * for some n).
3314           */
3315          inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
3316                         negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
3317                         brw_imm_ud(1));
3318 
3319       } else {
3320          /* Overwrite the destination without and with execution masking to
3321           * find out which of the channels is active.
3322           */
3323          brw_push_insn_state(p);
3324          brw_set_default_exec_size(p, BRW_EXECUTE_4);
3325          brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
3326                  brw_imm_ud(1));
3327 
3328          inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
3329                         brw_imm_ud(0));
3330          brw_pop_insn_state(p);
3331          brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
3332       }
3333    }
3334 
3335    brw_pop_insn_state(p);
3336 }
3337 
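/**
 * Copy the scalar value of the src channel selected by idx into dst.  When
 * the index is an immediate (or the source is already uniform) the source
 * region is simply re-strided; otherwise the component is fetched with
 * indirect addressing in Align1 mode or with a flag-predicated SEL in
 * Align16 mode.
 */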
3338 void
3339 brw_broadcast(struct brw_codegen *p,
3340               struct brw_reg dst,
3341               struct brw_reg src,
3342               struct brw_reg idx)
3343 {
3344    const struct gen_device_info *devinfo = p->devinfo;
3345    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3346    brw_inst *inst;
3347 
3348    brw_push_insn_state(p);
3349    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3350    brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);
3351 
3352    assert(src.file == BRW_GENERAL_REGISTER_FILE &&
3353           src.address_mode == BRW_ADDRESS_DIRECT);
3354    assert(!src.abs && !src.negate);
3355    assert(src.type == dst.type);
3356 
3357    if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
3358        idx.file == BRW_IMMEDIATE_VALUE) {
3359       /* Trivial, the source is already uniform or the index is a constant.
3360        * We will typically not get here if the optimizer is doing its job, but
3361        * asserting would be mean.
3362        */
3363       const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
3364       src = align1 ? stride(suboffset(src, i), 0, 1, 0) :
3365                      stride(suboffset(src, 4 * i), 0, 4, 1);
3366 
3367       if (type_sz(src.type) > 4 && !devinfo->has_64bit_float) {
3368          brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
3369                     subscript(src, BRW_REGISTER_TYPE_D, 0));
3370          brw_set_default_swsb(p, tgl_swsb_null());
3371          brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
3372                     subscript(src, BRW_REGISTER_TYPE_D, 1));
3373       } else {
3374          brw_MOV(p, dst, src);
3375       }
3376    } else {
3377       /* From the Haswell PRM section "Register Region Restrictions":
3378        *
3379        *    "The lower bits of the AddressImmediate must not overflow to
3380        *    change the register address.  The lower 5 bits of Address
3381        *    Immediate when added to lower 5 bits of address register gives
3382        *    the sub-register offset. The upper bits of Address Immediate
3383        *    when added to upper bits of address register gives the register
3384        *    address. Any overflow from sub-register offset is dropped."
3385        *
3386        * Fortunately, for broadcast, we never have a sub-register offset so
3387        * this isn't an issue.
3388        */
3389       assert(src.subnr == 0);
3390 
3391       if (align1) {
3392          const struct brw_reg addr =
3393             retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
3394          unsigned offset = src.nr * REG_SIZE + src.subnr;
3395          /* Limit in bytes of the signed indirect addressing immediate. */
3396          const unsigned limit = 512;
3397 
3398          brw_push_insn_state(p);
3399          brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3400          brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
3401 
3402          /* Take into account the component size and horizontal stride. */
3403          assert(src.vstride == src.hstride + src.width);
3404          brw_SHL(p, addr, vec1(idx),
3405                  brw_imm_ud(util_logbase2(type_sz(src.type)) +
3406                             src.hstride - 1));
3407 
3408          /* We can only address up to limit bytes using the indirect
3409           * addressing immediate; account for the difference if the source
3410           * register is above this limit.
3411           */
3412          if (offset >= limit) {
3413             brw_set_default_swsb(p, tgl_swsb_regdist(1));
3414             brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
3415             offset = offset % limit;
3416          }
3417 
3418          brw_pop_insn_state(p);
3419 
3420          brw_set_default_swsb(p, tgl_swsb_regdist(1));
3421 
3422          /* Use indirect addressing to fetch the specified component. */
3423          if (type_sz(src.type) > 4 &&
3424              (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo) ||
3425               !devinfo->has_64bit_float)) {
3426             /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
3427              *
3428              *    "When source or destination datatype is 64b or operation is
3429              *    integer DWord multiply, indirect addressing must not be
3430              *    used."
3431              *
3432              * To work around this restriction, we do two integer MOVs
3433              * instead of one 64-bit MOV.  Because no double value should ever
3434              * cross a register boundary, it's safe to use the immediate
3435              * offset in the indirect here to handle adding 4 bytes to the
3436              * offset and avoid the extra ADD to the register file.
3437              */
3438             brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
3439                        retype(brw_vec1_indirect(addr.subnr, offset),
3440                               BRW_REGISTER_TYPE_D));
3441             brw_set_default_swsb(p, tgl_swsb_null());
3442             brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
3443                        retype(brw_vec1_indirect(addr.subnr, offset + 4),
3444                               BRW_REGISTER_TYPE_D));
3445          } else {
3446             brw_MOV(p, dst,
3447                     retype(brw_vec1_indirect(addr.subnr, offset), src.type));
3448          }
3449       } else {
3450          /* In SIMD4x2 mode the index can be either zero or one; replicate it
3451           * to all bits of a flag register,
3452           */
3453          inst = brw_MOV(p,
3454                         brw_null_reg(),
3455                         stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
3456          brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
3457          brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
3458          brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3459 
3460          /* and use predicated SEL to pick the right channel. */
3461          inst = brw_SEL(p, dst,
3462                         stride(suboffset(src, 4), 4, 4, 1),
3463                         stride(src, 4, 4, 1));
3464          brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
3465          brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3466       }
3467    }
3468 
3469    brw_pop_insn_state(p);
3470 }
3471 
3472 /**
3473  * This instruction is generated as a single-channel align1 instruction by
3474  * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3475  *
3476  * We can't use the typed atomic op in the FS because that has the execution
3477  * mask ANDed with the pixel mask, but we just want to write the one dword for
3478  * all the pixels.
3479  *
3480  * We don't use the SIMD4x2 atomic ops in the VS because we just want to write
3481  * one u32.  So we use the same untyped atomic write message as the pixel
3482  * shader.
3483  *
3484  * The untyped atomic operation requires a BUFFER surface type with RAW
3485  * format, and is only accessible through the legacy DATA_CACHE dataport
3486  * messages.
3487  */
3488 void brw_shader_time_add(struct brw_codegen *p,
3489                          struct brw_reg payload,
3490                          uint32_t surf_index)
3491 {
3492    const struct gen_device_info *devinfo = p->devinfo;
3493    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3494                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3495                           GEN7_SFID_DATAPORT_DATA_CACHE);
3496    assert(devinfo->gen >= 7);
3497 
3498    brw_push_insn_state(p);
3499    brw_set_default_access_mode(p, BRW_ALIGN_1);
3500    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3501    brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3502    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3503 
3504    /* We use brw_vec1_reg and unmasked because we want to increment the given
3505     * offset only once.
3506     */
3507    brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3508                                       BRW_ARF_NULL, 0));
3509    brw_set_src0(p, send, brw_vec1_reg(payload.file,
3510                                       payload.nr, 0));
3511    brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
3512                           brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
3513                                                      false)));
3514 
3515    brw_inst_set_sfid(devinfo, send, sfid);
3516    brw_inst_set_binding_table_index(devinfo, send, surf_index);
3517 
3518    brw_pop_insn_state(p);
3519 }
3520 
3521 
3522 /**
3523  * Emit the SEND message for a barrier
3524  */
3525 void
3526 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3527 {
3528    const struct gen_device_info *devinfo = p->devinfo;
3529    struct brw_inst *inst;
3530 
3531    assert(devinfo->gen >= 7);
3532 
3533    brw_push_insn_state(p);
3534    brw_set_default_access_mode(p, BRW_ALIGN_1);
3535    inst = next_insn(p, BRW_OPCODE_SEND);
3536    brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3537    brw_set_src0(p, inst, src);
3538    brw_set_src1(p, inst, brw_null_reg());
3539    brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));
3540 
3541    brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
3542    brw_inst_set_gateway_subfuncid(devinfo, inst,
3543                                   BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3544 
3545    brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3546    brw_pop_insn_state(p);
3547 }
3548 
3549 
3550 /**
3551  * Emit the wait instruction for a barrier
3552  */
3553 void
3554 brw_WAIT(struct brw_codegen *p)
3555 {
3556    const struct gen_device_info *devinfo = p->devinfo;
3557    struct brw_inst *insn;
3558 
3559    struct brw_reg src = brw_notification_reg();
3560 
3561    insn = next_insn(p, BRW_OPCODE_WAIT);
3562    brw_set_dest(p, insn, src);
3563    brw_set_src0(p, insn, src);
3564    brw_set_src1(p, insn, brw_null_reg());
3565 
3566    brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3567    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3568 }
3569 
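/**
 * Update the floating-point mode bits of control register cr0.0: the bits
 * selected by mask are cleared (cr0 &= ~mask) and, if mode is non-zero,
 * the bits in mode are then set (cr0 |= mode), with the thread-switch or
 * SWSB handling that cr0 accesses require (see below).
 */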
3570 void
3571 brw_float_controls_mode(struct brw_codegen *p,
3572                         unsigned mode, unsigned mask)
3573 {
3574    /* From the Skylake PRM, Volume 7, page 760:
3575     *  "Implementation Restriction on Register Access: When the control
3576     *   register is used as an explicit source and/or destination, hardware
3577     *   does not ensure execution pipeline coherency. Software must set the
3578     *   thread control field to ‘switch’ for an instruction that uses
3579     *   control register as an explicit operand."
3580     *
3581     * On Gen12+ this is implemented in terms of SWSB annotations instead.
3582     */
3583    brw_set_default_swsb(p, tgl_swsb_regdist(1));
3584 
3585    brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3586                             brw_imm_ud(~mask));
3587    brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3588    if (p->devinfo->gen < 12)
3589       brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3590 
3591    if (mode) {
3592       brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3593                                  brw_imm_ud(mode));
3594       brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
3595       if (p->devinfo->gen < 12)
3596          brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
3597    }
3598 
3599    if (p->devinfo->gen >= 12)
3600       brw_SYNC(p, TGL_SYNC_NOP);
3601 }
3602