/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
 /*
  * Authors:
  *   Keith Whitwell <keithw@vmware.com>
  */
32 
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35 
36 #include "util/ralloc.h"
37 
/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
45 void
gfx6_resolve_implied_move(struct brw_codegen * p,struct brw_reg * src,unsigned msg_reg_nr)46 gfx6_resolve_implied_move(struct brw_codegen *p,
47 			  struct brw_reg *src,
48 			  unsigned msg_reg_nr)
49 {
50    const struct intel_device_info *devinfo = p->devinfo;
51    if (devinfo->ver < 6)
52       return;
53 
54    if (src->file == BRW_MESSAGE_REGISTER_FILE)
55       return;
56 
57    if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58       assert(devinfo->ver < 12);
59       brw_push_insn_state(p);
60       brw_set_default_exec_size(p, BRW_EXECUTE_8);
61       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
62       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
63       brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
64 	      retype(*src, BRW_REGISTER_TYPE_UD));
65       brw_pop_insn_state(p);
66    }
67    *src = brw_message_reg(msg_reg_nr);
68 }
69 
70 static void
gfx7_convert_mrf_to_grf(struct brw_codegen * p,struct brw_reg * reg)71 gfx7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
72 {
73    /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74     * "The send with EOT should use register space R112-R127 for <src>. This is
75     *  to enable loading of a new thread into the same slot while the message
76     *  with EOT for current thread is pending dispatch."
77     *
78     * Since we're pretending to have 16 MRFs anyway, we may as well use the
79     * registers required for messages with EOT.
80     */
81    const struct intel_device_info *devinfo = p->devinfo;
82    if (devinfo->ver >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
83       reg->file = BRW_GENERAL_REGISTER_FILE;
84       reg->nr += GFX7_MRF_HACK_START;
85    }
86 }
87 
/**
 * Encode the destination operand of \p inst from \p dest.
 *
 * Handles the three encoding families visible here: Gfx12+ SEND/SENDC
 * (bare register number), pre-Gfx12 split sends (SENDS/SENDSC), and
 * ordinary instructions (full file/type/region encoding).  May also shrink
 * the instruction's exec size to match a small destination when
 * p->automatic_exec_sizes is set.
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* Range-check the register number for files with a fixed register count. */
   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* The hardware has a restriction where a destination of size Byte with
    * a stride of 1 is only allowed for a packed byte MOV. For any other
    * instruction, the stride must be at least 2, even when the destination
    * is the NULL register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1 &&
       dest.hstride == BRW_HORIZONTAL_STRIDE_1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   gfx7_convert_mrf_to_grf(p, &dest);

   if (devinfo->ver >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Gfx12+ SEND/SENDC destination: only a direct GRF/ARF register
       * number is encodable — no subregister, modifiers, or region.
       */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gfx12 split send: similar restrictions, but the subregister is
       * encoded in 16-byte units.
       */
      assert(devinfo->ver < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      /* Ordinary instruction: encode file/type, address mode, and region. */
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            /* Stride 0 is not encodable for destinations; promote to 1. */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             *    Although Dst.HorzStride is a don't care for Align16, HW needs
             *    this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            /* Stride 0 is not encodable for destinations; promote to 1. */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->ver >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
207 
/**
 * Encode source operand 0 of \p inst from \p reg.
 *
 * Like brw_set_dest(), this distinguishes Gfx12+ SEND/SENDC, pre-Gfx12
 * split sends, and ordinary instructions (including immediates and
 * indirect addressing).
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* Range-check the register number for files with a fixed register count. */
   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   gfx7_convert_mrf_to_grf(p, &reg);

   if (devinfo->ver >= 6 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (devinfo->ver >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Gfx12+ SEND/SENDC src0: bare register number, no subregister. */
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gfx12 split send src0: GRF only, subregister in 16-byte units. */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* Pick the immediate field width from the register type. */
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         /* For sub-64-bit immediates before Gfx12, also program the src1
          * slot (which overlaps the immediate) with a matching type.
          */
         if (devinfo->ver < 12 && type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            /* The address immediate field differs between align1/align16. */
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* Encode a <0;1,0> scalar region for width-1/exec-1 sources. */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
342 
343 
/**
 * Encode source operand 1 of \p inst from \p reg.
 *
 * Split sends (SENDS/SENDSC, and SEND/SENDC on Gfx12+) use dedicated
 * src1 fields; all other instructions use the regular operand encoding,
 * where src1 may also be a 32-bit immediate.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->ver >= 12 &&
        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
      /* Split-send src1: bare register number, no subregister/modifiers. */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *    operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      gfx7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert (reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* Encode a <0;1,0> scalar region for width-1/exec-1 sources. */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
449 
/**
 * Specify the descriptor and extended descriptor immediate for a SEND(C)
 * message instruction.
 */
454 void
brw_set_desc_ex(struct brw_codegen * p,brw_inst * inst,unsigned desc,unsigned ex_desc)455 brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
456                 unsigned desc, unsigned ex_desc)
457 {
458    const struct intel_device_info *devinfo = p->devinfo;
459    assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
460           brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
461    if (devinfo->ver < 12)
462       brw_inst_set_src1_file_type(devinfo, inst,
463                                   BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
464    brw_inst_set_send_desc(devinfo, inst, desc);
465    if (devinfo->ver >= 9)
466       brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
467 }
468 
/* Fill in the message descriptor and MATH-specific control fields for a
 * legacy math message.  Message/response lengths are derived from the
 * requested function.
 */
static void brw_set_math_message( struct brw_codegen *p,
				  brw_inst *inst,
				  unsigned function,
				  unsigned integer_type,
				  bool low_precision,
				  unsigned dataType )
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* Two-operand functions carry both operands in the message payload. */
   const bool two_operands =
      function == BRW_MATH_FUNCTION_POW ||
      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
      function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;
   const unsigned msg_length = two_operands ? 2 : 1;

   /* SINCOS and QUOTIENT_AND_REMAINDER produce a pair of results. */
   const bool two_results =
      function == BRW_MATH_FUNCTION_SINCOS ||
      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;
   const unsigned response_length = two_results ? 2 : 1;

   brw_set_desc(p, inst,
                brw_message_desc(devinfo, msg_length, response_length, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   /* Saturation is expressed in the message descriptor rather than on the
    * instruction, so move it over and clear the instruction bit.
    */
   brw_inst_set_math_msg_saturate(devinfo, inst,
                                  brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}
515 
516 
/* Configure a URB FF_SYNC message: single-register payload, optional
 * handle allocation, optional end-of-thread.
 */
static void brw_set_ff_sync_message(struct brw_codegen *p,
				    brw_inst *insn,
				    bool allocate,
				    unsigned response_length,
				    bool end_of_thread)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* FF_SYNC always sends exactly one register (the header). */
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);

   /* FF_SYNC ignores the remaining URB fields; program them to zero. */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}
538 
/* Configure a URB write message (HWORD or OWORD flavor) according to
 * \p flags, with generation-specific fields gated on devinfo->ver.
 */
static void brw_set_urb_message( struct brw_codegen *p,
				 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
				 unsigned msg_length,
				 unsigned response_length,
				 unsigned offset,
				 unsigned swizzle_control )
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* Transposed swizzles and handle allocation went away on Gfx7; the
    * per-slot offset bit only exists from Gfx7 onwards.
    */
   assert(devinfo->ver < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->ver < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->ver >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, (flags & BRW_URB_WRITE_EOT) != 0);

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->ver < 8) {
      brw_inst_set_urb_complete(devinfo, insn,
                                (flags & BRW_URB_WRITE_COMPLETE) != 0);
   }

   if (devinfo->ver < 7) {
      brw_inst_set_urb_allocate(devinfo, insn,
                                (flags & BRW_URB_WRITE_ALLOCATE) != 0);
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
         (flags & BRW_URB_WRITE_PER_SLOT_OFFSET) != 0);
   }
}
581 
582 static void
gfx7_set_dp_scratch_message(struct brw_codegen * p,brw_inst * inst,bool write,bool dword,bool invalidate_after_read,unsigned num_regs,unsigned addr_offset,unsigned mlen,unsigned rlen,bool header_present)583 gfx7_set_dp_scratch_message(struct brw_codegen *p,
584                             brw_inst *inst,
585                             bool write,
586                             bool dword,
587                             bool invalidate_after_read,
588                             unsigned num_regs,
589                             unsigned addr_offset,
590                             unsigned mlen,
591                             unsigned rlen,
592                             bool header_present)
593 {
594    const struct intel_device_info *devinfo = p->devinfo;
595    assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
596           (devinfo->ver >= 8 && num_regs == 8));
597    const unsigned block_size = (devinfo->ver >= 8 ? util_logbase2(num_regs) :
598                                 num_regs - 1);
599 
600    brw_set_desc(p, inst, brw_message_desc(
601                    devinfo, mlen, rlen, header_present));
602 
603    brw_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE);
604    brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
605    brw_inst_set_scratch_read_write(devinfo, inst, write);
606    brw_inst_set_scratch_type(devinfo, inst, dword);
607    brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
608    brw_inst_set_scratch_block_size(devinfo, inst, block_size);
609    brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
610 }
611 
/* Copy the generator's default instruction state (exec size, masking,
 * predication, flag register, etc.) into a freshly allocated instruction.
 */
static void
brw_inst_set_state(const struct intel_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   /* Gfx12+ carries software-scoreboard (SWSB) dependency info per insn. */
   if (devinfo->ver >= 12)
      brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(devinfo, state->swsb));
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   /* Three-source Align16 instructions keep the flag register in dedicated
    * fields; everything else uses the common flag fields.  flag_subreg packs
    * reg (bit 1) and subreg (bit 0) into one number, hence the %2 / /2.
    */
   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->ver >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}
642 
643 static brw_inst *
brw_append_insns(struct brw_codegen * p,unsigned nr_insn,unsigned align)644 brw_append_insns(struct brw_codegen *p, unsigned nr_insn, unsigned align)
645 {
646    assert(util_is_power_of_two_or_zero(sizeof(brw_inst)));
647    assert(util_is_power_of_two_or_zero(align));
648    const unsigned align_insn = MAX2(align / sizeof(brw_inst), 1);
649    const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
650    const unsigned new_nr_insn = start_insn + nr_insn;
651 
652    if (p->store_size < new_nr_insn) {
653       p->store_size = util_next_power_of_two(new_nr_insn * sizeof(brw_inst));
654       p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
655    }
656 
657    /* Memset any padding due to alignment to 0.  We don't want to be hashing
658     * or caching a bunch of random bits we got from a memory allocation.
659     */
660    if (p->nr_insn < start_insn) {
661       memset(&p->store[p->nr_insn], 0,
662              (start_insn - p->nr_insn) * sizeof(brw_inst));
663    }
664 
665    assert(p->next_insn_offset == p->nr_insn * sizeof(brw_inst));
666    p->nr_insn = new_nr_insn;
667    p->next_insn_offset = new_nr_insn * sizeof(brw_inst);
668 
669    return &p->store[start_insn];
670 }
671 
void
brw_realign(struct brw_codegen *p, unsigned align)
{
   /* Appending zero instructions still pads the stream out to `align`. */
   (void) brw_append_insns(p, 0, align);
}
677 
678 int
brw_append_data(struct brw_codegen * p,void * data,unsigned size,unsigned align)679 brw_append_data(struct brw_codegen *p, void *data,
680                 unsigned size, unsigned align)
681 {
682    unsigned nr_insn = DIV_ROUND_UP(size, sizeof(brw_inst));
683    void *dst = brw_append_insns(p, nr_insn, align);
684    memcpy(dst, data, size);
685 
686    /* If it's not a whole number of instructions, memset the end */
687    if (size < nr_insn * sizeof(brw_inst))
688       memset(dst + size, 0, nr_insn * sizeof(brw_inst) - size);
689 
690    return dst - (void *)p->store;
691 }
692 
693 #define next_insn brw_next_insn
694 brw_inst *
brw_next_insn(struct brw_codegen * p,unsigned opcode)695 brw_next_insn(struct brw_codegen *p, unsigned opcode)
696 {
697    const struct intel_device_info *devinfo = p->devinfo;
698    brw_inst *insn = brw_append_insns(p, 1, sizeof(brw_inst));
699 
700    memset(insn, 0, sizeof(*insn));
701    brw_inst_set_opcode(devinfo, insn, opcode);
702 
703    /* Apply the default instruction state */
704    brw_inst_set_state(devinfo, insn, p->current);
705 
706    return insn;
707 }
708 
709 void
brw_add_reloc(struct brw_codegen * p,uint32_t id,enum brw_shader_reloc_type type,uint32_t offset,uint32_t delta)710 brw_add_reloc(struct brw_codegen *p, uint32_t id,
711               enum brw_shader_reloc_type type,
712               uint32_t offset, uint32_t delta)
713 {
714    if (p->num_relocs + 1 > p->reloc_array_size) {
715       p->reloc_array_size = MAX2(16, p->reloc_array_size * 2);
716       p->relocs = reralloc(p->mem_ctx, p->relocs,
717                            struct brw_shader_reloc, p->reloc_array_size);
718    }
719 
720    p->relocs[p->num_relocs++] = (struct brw_shader_reloc) {
721       .id = id,
722       .type = type,
723       .offset = offset,
724       .delta = delta,
725    };
726 }
727 
728 static brw_inst *
brw_alu1(struct brw_codegen * p,unsigned opcode,struct brw_reg dest,struct brw_reg src)729 brw_alu1(struct brw_codegen *p, unsigned opcode,
730          struct brw_reg dest, struct brw_reg src)
731 {
732    brw_inst *insn = next_insn(p, opcode);
733    brw_set_dest(p, insn, dest);
734    brw_set_src0(p, insn, src);
735    return insn;
736 }
737 
738 static brw_inst *
brw_alu2(struct brw_codegen * p,unsigned opcode,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)739 brw_alu2(struct brw_codegen *p, unsigned opcode,
740          struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
741 {
742    /* 64-bit immediates are only supported on 1-src instructions */
743    assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
744    assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
745 
746    brw_inst *insn = next_insn(p, opcode);
747    brw_set_dest(p, insn, dest);
748    brw_set_src0(p, insn, src0);
749    brw_set_src1(p, insn, src1);
750    return insn;
751 }
752 
753 static int
get_3src_subreg_nr(struct brw_reg reg)754 get_3src_subreg_nr(struct brw_reg reg)
755 {
756    /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
757     * use 32-bit units (components 0..7).  Since they only support F/D/UD
758     * types, this doesn't lose any flexibility, but uses fewer bits.
759     */
760    return reg.subnr / 4;
761 }
762 
763 static enum gfx10_align1_3src_vertical_stride
to_3src_align1_vstride(const struct intel_device_info * devinfo,enum brw_vertical_stride vstride)764 to_3src_align1_vstride(const struct intel_device_info *devinfo,
765                        enum brw_vertical_stride vstride)
766 {
767    switch (vstride) {
768    case BRW_VERTICAL_STRIDE_0:
769       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
770    case BRW_VERTICAL_STRIDE_1:
771       assert(devinfo->ver >= 12);
772       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
773    case BRW_VERTICAL_STRIDE_2:
774       assert(devinfo->ver < 12);
775       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
776    case BRW_VERTICAL_STRIDE_4:
777       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
778    case BRW_VERTICAL_STRIDE_8:
779    case BRW_VERTICAL_STRIDE_16:
780       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
781    default:
782       unreachable("invalid vstride");
783    }
784 }
785 
786 
787 static enum gfx10_align1_3src_src_horizontal_stride
to_3src_align1_hstride(enum brw_horizontal_stride hstride)788 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
789 {
790    switch (hstride) {
791    case BRW_HORIZONTAL_STRIDE_0:
792       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
793    case BRW_HORIZONTAL_STRIDE_1:
794       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
795    case BRW_HORIZONTAL_STRIDE_2:
796       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
797    case BRW_HORIZONTAL_STRIDE_4:
798       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
799    default:
800       unreachable("invalid hstride");
801    }
802 }
803 
/* Emit a three-source ALU instruction (MAD, LRP, BFE, ...), encoding the
 * operands for either Align1 or Align16 mode depending on the current
 * default access mode.
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gfx7_convert_mrf_to_grf(p, &dest);

   assert(dest.nr < 128);

   /* Gfx10+ allows at most one immediate operand among src0/src2. */
   if (devinfo->ver >= 10)
      assert(!(src0.file == BRW_IMMEDIATE_VALUE &&
               src2.file == BRW_IMMEDIATE_VALUE));

   /* src1 can never be an immediate; only direct register addressing is
    * representable in the 3-src encodings.
    */
   assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
   assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
   assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      /* Align1 encoding. */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);

      if (devinfo->ver >= 12) {
         /* Gfx12+ encodes the destination register file directly. */
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      } else {
         /* Pre-Gfx12 has a two-value file field; the only encodable ARF
          * destination is the accumulator.
          */
         if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_ACCUMULATOR);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
         }
      }
      /* Destination subregister is encoded in 8-byte units here. */
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      /* Execution pipe is chosen from the destination type. */
      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      if (src0.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
      } else {
         brw_inst_set_3src_a1_src0_vstride(
            devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
         brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src0.hstride));
         brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
         /* NF-typed src0 is routed through the accumulator (see the file
          * assertion below).
          */
         if (src0.type == BRW_REGISTER_TYPE_NF) {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
         }
         brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
         brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      }
      brw_inst_set_3src_a1_src1_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));

      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      /* An ARF src1 is encoded as the accumulator register number. */
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      if (src2.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
      } else {
         brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src2.hstride));
         /* no vstride on src2 */
         brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
         brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
         brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
         brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      }

      /* Sanity-check the operand files the encodings above assumed. */
      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      if (devinfo->ver >= 12) {
         /* Gfx12+ stores the register file directly plus an is_imm bit
          * for src0/src2.
          */
         if (src0.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
         }

         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);

         if (src2.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
         }
      } else {
         /* Pre-Gfx12 uses two-value file fields: GRF vs. immediate for
          * src0/src2, GRF vs. accumulator for src1.
          */
         brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                            src0.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                            src1.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                            src2.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      }

   } else {
      /* Align16 encoding. */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F  ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D  ||
             dest.type == BRW_REGISTER_TYPE_UD ||
             (dest.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 8));
      if (devinfo->ver == 6) {
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      }
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      /* Destination subregister is encoded in 4-byte units in Align16. */
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      /* All three Align16 sources must be GRFs; rep_ctrl marks a scalar
       * (vstride-0) source.
       */
      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      if (devinfo->ver >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types.  The MAD and LRP emitters ensure
          * that all four types are float.  The BFE and BFI2 emitters, however,
          * may send us mixed D and UD types and want us to ignore that and use
          * the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);

         /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
          *
          *    "Three source instructions can use operands with mixed-mode
          *     precision. When SrcType field is set to :f or :hf it defines
          *     precision for source 0 only, and fields Src1Type and Src2Type
          *     define precision for other source operands:
          *
          *     0b = :f. Single precision Float (32-bit).
          *     1b = :hf. Half precision Float (16-bit)."
          */
         if (src1.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);

         if (src2.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
      }
   }

   return inst;
}
1015 
1016 
1017 /***********************************************************************
1018  * Convenience routines.
1019  */
/* Expand to a one-source convenience emitter brw_<OP>() that forwards to
 * brw_alu1() with opcode BRW_OPCODE_<OP>.
 */
#define ALU1(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}
1027 
/* Expand to a two-source convenience emitter brw_<OP>() that forwards to
 * brw_alu2() with opcode BRW_OPCODE_<OP>.
 */
#define ALU2(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}
1036 
/* Expand to a three-source convenience emitter brw_<OP>().  In Align16
 * mode a scalar source (vertical stride 0) cannot be encoded directly, so
 * it is rewritten as a .xxxx swizzle before forwarding to brw_alu3().
 */
#define ALU3(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{                                                       \
   if (p->current->access_mode == BRW_ALIGN_16) {       \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)        \
         src0.swizzle = BRW_SWIZZLE_XXXX;               \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)        \
         src1.swizzle = BRW_SWIZZLE_XXXX;               \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)        \
         src2.swizzle = BRW_SWIZZLE_XXXX;               \
   }                                                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}
1054 
/* Like ALU3, but additionally asserts that all operand types are float
 * (all F, or all DF) — used for LRP, which is float-only.
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
                                                                \
   if (p->current->access_mode == BRW_ALIGN_16) {               \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                \
         src0.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                \
         src1.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                \
         src2.swizzle = BRW_SWIZZLE_XXXX;                       \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
1084 
/* Instantiate the convenience emitters.  Each line defines a brw_<OP>()
 * wrapper around brw_alu1/2/3 with opcode BRW_OPCODE_<OP>.
 */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDU)
ALU1(RNDZ)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(DP4A)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)
ALU3(ADD3)
1122 
1123 brw_inst *
1124 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
1125 {
1126    const struct intel_device_info *devinfo = p->devinfo;
1127 
1128    /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1129     * To avoid the problems that causes, we use an <X,2,0> source region to
1130     * read each element twice.
1131     */
1132    if (devinfo->verx10 == 70 &&
1133        brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
1134        dest.type == BRW_REGISTER_TYPE_DF &&
1135        (src0.type == BRW_REGISTER_TYPE_F ||
1136         src0.type == BRW_REGISTER_TYPE_D ||
1137         src0.type == BRW_REGISTER_TYPE_UD) &&
1138        !has_scalar_region(src0)) {
1139       assert(src0.vstride == src0.width + src0.hstride);
1140       src0.vstride = src0.hstride;
1141       src0.width = BRW_WIDTH_2;
1142       src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1143    }
1144 
1145    return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1146 }
1147 
1148 brw_inst *
brw_ADD(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1149 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1150         struct brw_reg src0, struct brw_reg src1)
1151 {
1152    /* 6.2.2: add */
1153    if (src0.type == BRW_REGISTER_TYPE_F ||
1154        (src0.file == BRW_IMMEDIATE_VALUE &&
1155 	src0.type == BRW_REGISTER_TYPE_VF)) {
1156       assert(src1.type != BRW_REGISTER_TYPE_UD);
1157       assert(src1.type != BRW_REGISTER_TYPE_D);
1158    }
1159 
1160    if (src1.type == BRW_REGISTER_TYPE_F ||
1161        (src1.file == BRW_IMMEDIATE_VALUE &&
1162 	src1.type == BRW_REGISTER_TYPE_VF)) {
1163       assert(src0.type != BRW_REGISTER_TYPE_UD);
1164       assert(src0.type != BRW_REGISTER_TYPE_D);
1165    }
1166 
1167    return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1168 }
1169 
1170 brw_inst *
brw_AVG(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1171 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1172         struct brw_reg src0, struct brw_reg src1)
1173 {
1174    assert(dest.type == src0.type);
1175    assert(src0.type == src1.type);
1176    switch (src0.type) {
1177    case BRW_REGISTER_TYPE_B:
1178    case BRW_REGISTER_TYPE_UB:
1179    case BRW_REGISTER_TYPE_W:
1180    case BRW_REGISTER_TYPE_UW:
1181    case BRW_REGISTER_TYPE_D:
1182    case BRW_REGISTER_TYPE_UD:
1183       break;
1184    default:
1185       unreachable("Bad type for brw_AVG");
1186    }
1187 
1188    return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1189 }
1190 
1191 brw_inst *
brw_MUL(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1192 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1193         struct brw_reg src0, struct brw_reg src1)
1194 {
1195    /* 6.32.38: mul */
1196    if (src0.type == BRW_REGISTER_TYPE_D ||
1197        src0.type == BRW_REGISTER_TYPE_UD ||
1198        src1.type == BRW_REGISTER_TYPE_D ||
1199        src1.type == BRW_REGISTER_TYPE_UD) {
1200       assert(dest.type != BRW_REGISTER_TYPE_F);
1201    }
1202 
1203    if (src0.type == BRW_REGISTER_TYPE_F ||
1204        (src0.file == BRW_IMMEDIATE_VALUE &&
1205 	src0.type == BRW_REGISTER_TYPE_VF)) {
1206       assert(src1.type != BRW_REGISTER_TYPE_UD);
1207       assert(src1.type != BRW_REGISTER_TYPE_D);
1208    }
1209 
1210    if (src1.type == BRW_REGISTER_TYPE_F ||
1211        (src1.file == BRW_IMMEDIATE_VALUE &&
1212 	src1.type == BRW_REGISTER_TYPE_VF)) {
1213       assert(src0.type != BRW_REGISTER_TYPE_UD);
1214       assert(src0.type != BRW_REGISTER_TYPE_D);
1215    }
1216 
1217    assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1218 	  src0.nr != BRW_ARF_ACCUMULATOR);
1219    assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1220 	  src1.nr != BRW_ARF_ACCUMULATOR);
1221 
1222    return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1223 }
1224 
1225 brw_inst *
brw_LINE(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1226 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1227          struct brw_reg src0, struct brw_reg src1)
1228 {
1229    src0.vstride = BRW_VERTICAL_STRIDE_0;
1230    src0.width = BRW_WIDTH_1;
1231    src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1232    return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1233 }
1234 
1235 brw_inst *
brw_PLN(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1236 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1237         struct brw_reg src0, struct brw_reg src1)
1238 {
1239    src0.vstride = BRW_VERTICAL_STRIDE_0;
1240    src0.width = BRW_WIDTH_1;
1241    src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1242    src1.vstride = BRW_VERTICAL_STRIDE_8;
1243    src1.width = BRW_WIDTH_8;
1244    src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1245    return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1246 }
1247 
/* Convert a 32-bit float source to half precision in dst.  On Gfx8+ this
 * is a converting MOV to HF; Gfx7 uses the dedicated F32TO16 instruction.
 * When the destination is UD (and the hardware doesn't do it for us), a
 * second MOV explicitly zero-fills the high 16 bits of each dword.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gfx8 implementation in terms of a
    * converting MOV.  Gfx7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->ver >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* Rewrite dst so the conversion lands only in the even (low) words
       * of each dword.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->ver >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->ver == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* Zero the odd (high) words with a second MOV.  Pre-Gfx12 the
       * no_dd_clear/no_dd_check pair makes dependency tracking treat the
       * two MOVs as a single write of the destination.
       */
      if (devinfo->ver < 12)
         brw_inst_set_no_dd_clear(devinfo, inst, true);
      brw_set_default_swsb(p, tgl_swsb_null());
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      if (devinfo->ver < 12)
         brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1297 
1298 brw_inst *
brw_F16TO32(struct brw_codegen * p,struct brw_reg dst,struct brw_reg src)1299 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1300 {
1301    const struct intel_device_info *devinfo = p->devinfo;
1302    bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1303 
1304    if (align16) {
1305       assert(src.type == BRW_REGISTER_TYPE_UD);
1306    } else {
1307       /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1308        *
1309        *   Because this instruction does not have a 16-bit floating-point
1310        *   type, the source data type must be Word (W). The destination type
1311        *   must be F (Float).
1312        */
1313       if (src.type == BRW_REGISTER_TYPE_UD)
1314          src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1315 
1316       assert(src.type == BRW_REGISTER_TYPE_W ||
1317              src.type == BRW_REGISTER_TYPE_UW ||
1318              src.type == BRW_REGISTER_TYPE_HF);
1319    }
1320 
1321    if (devinfo->ver >= 8) {
1322       return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1323    } else {
1324       assert(devinfo->ver == 7);
1325       return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1326    }
1327 }
1328 
1329 
brw_NOP(struct brw_codegen * p)1330 void brw_NOP(struct brw_codegen *p)
1331 {
1332    brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1333    memset(insn, 0, sizeof(*insn));
1334    brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1335 }
1336 
brw_SYNC(struct brw_codegen * p,enum tgl_sync_function func)1337 void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
1338 {
1339    brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
1340    brw_inst_set_cond_modifier(p->devinfo, insn, func);
1341 }
1342 
1343 /***********************************************************************
1344  * Comparisons, if/else/endif
1345  */
1346 
1347 brw_inst *
brw_JMPI(struct brw_codegen * p,struct brw_reg index,unsigned predicate_control)1348 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1349          unsigned predicate_control)
1350 {
1351    const struct intel_device_info *devinfo = p->devinfo;
1352    struct brw_reg ip = brw_ip_reg();
1353    brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1354 
1355    brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1356    brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1357    brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1358    brw_inst_set_pred_control(devinfo, inst, predicate_control);
1359 
1360    return inst;
1361 }
1362 
/* Push an IF/ELSE instruction onto the if-stack, stored as a
 * store-relative index (the store may be reallocated, so raw pointers
 * would dangle).
 */
static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   /* Grow after the write: the invariant if_stack_array_size >
    * if_stack_depth on entry guarantees the store above was in bounds,
    * and the growth here re-establishes it for the next call.
    */
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
			     p->if_stack_array_size);
   }
}
1375 
1376 static brw_inst *
pop_if_stack(struct brw_codegen * p)1377 pop_if_stack(struct brw_codegen *p)
1378 {
1379    p->if_stack_depth--;
1380    return &p->store[p->if_stack[p->if_stack_depth]];
1381 }
1382 
/* Push a DO instruction onto the loop stack (as a store-relative index)
 * and reset the IF-nesting counter for the new innermost loop.
 */
static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   /* Grow before writing: the "+ 1" reserves room for the
    * if_depth_in_loop slot at the new (post-increment) depth as well.
    */
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
			       p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
				     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   /* Indexed at the new depth: no IFs opened yet inside this loop. */
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}
1398 
1399 static brw_inst *
get_inner_do_insn(struct brw_codegen * p)1400 get_inner_do_insn(struct brw_codegen *p)
1401 {
1402    return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1403 }
1404 
1405 /* EU takes the value from the flag register and pushes it onto some
1406  * sort of a stack (presumably merging with any flag value already on
1407  * the stack).  Within an if block, the flags at the top of the stack
1408  * control execution on each channel of the unit, eg. on each of the
1409  * 16 pixel values in our wm programs.
1410  *
1411  * When the matching 'else' instruction is reached (presumably by
1412  * countdown of the instruction count patched in by our ELSE/ENDIF
1413  * functions), the relevant flags are inverted.
1414  *
1415  * When the matching 'endif' instruction is reached, the flags are
1416  * popped off.  If the stack is now empty, normal execution resumes.
1417  */
/* Emit an IF instruction and push it on the if-stack; the jump targets
 * are left zero to be patched later by the matching ELSE/ENDIF.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->ver < 6) {
      /* Pre-Gfx6: IP-relative form; the branch offset in src1 is patched
       * later.
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      /* Gfx6: the jump count lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->ver == 7) {
      /* Gfx7: dedicated JIP/UIP fields, zeroed here and patched later. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gfx8+: JIP/UIP fields; src0 exists only before Gfx12. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      if (devinfo->ver < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-Gfx6 multi-flow execution needs a thread switch on IF. */
   if (!p->single_program_flow && devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   /* Track IF nesting inside the innermost loop for ENDIF bookkeeping. */
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1462 
1463 /* This function is only used for gfx6-style IF instructions with an
1464  * embedded comparison (conditional modifier).  It is not used on gfx7.
1465  */
1466 brw_inst *
gfx6_IF(struct brw_codegen * p,enum brw_conditional_mod conditional,struct brw_reg src0,struct brw_reg src1)1467 gfx6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
1468 	struct brw_reg src0, struct brw_reg src1)
1469 {
1470    const struct intel_device_info *devinfo = p->devinfo;
1471    brw_inst *insn;
1472 
1473    insn = next_insn(p, BRW_OPCODE_IF);
1474 
1475    brw_set_dest(p, insn, brw_imm_w(0));
1476    brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1477    brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
1478    brw_set_src0(p, insn, src0);
1479    brw_set_src1(p, insn, src1);
1480 
1481    assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
1482    assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
1483    brw_inst_set_cond_modifier(devinfo, insn, conditional);
1484 
1485    push_if_stack(p, insn);
1486    return insn;
1487 }
1488 
1489 /**
1490  * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1491  */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* IP offsets are in bytes; instruction distances are scaled by 16
       * (the native instruction size on the pre-gfx6 platforms where SPF
       * conversion is used — see brw_ENDIF).  The "+ 1" skips past the
       * ELSE-turned-ADD itself.
       */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1529 
1530 /**
1531  * Patch IF and ELSE instructions with appropriate jump targets.
1532  */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* else_inst may be NULL when the IF block has no ELSE clause. */

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->ver < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   /* All jump-count/JIP/UIP distances below are in units of br per
    * instruction (the per-generation jump scale).
    */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->ver < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gfx4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->ver == 6) {
	 /* As of gfx6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gfx6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         /* Gfx7+: with no ELSE, both JIP and UIP point at the ENDIF. */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->ver < 6) {
         brw_inst_set_gfx4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->ver == 6) {
         brw_inst_set_gfx6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->ver < 6) {
	 /* BRW_OPCODE_ELSE pre-gfx6 should point just past the
	  * matching ENDIF.
	  */
         brw_inst_set_gfx4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gfx4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->ver == 6) {
	 /* BRW_OPCODE_ELSE on gfx6 should point to the matching ENDIF. */
         brw_inst_set_gfx6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->ver >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1620 
/* Emit an ELSE instruction for the innermost IF block.  The per-generation
 * operand encodings mirror brw_IF(); all jump-target fields are left zero
 * and filled in later by patch_IF_ELSE().
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->ver < 6) {
      /* Gfx4-5: ELSE operates on IP; jump count in src1, patched later. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      /* Gfx6: jump count encoded in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->ver == 7) {
      /* Gfx7: separate JIP/UIP fields; src1 is an immediate word. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gfx8+: JIP/UIP fields; gfx12+ drops the immediate src0. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      if (devinfo->ver < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Same {Switch} rule as brw_IF: pre-gfx6 flow control, unless in SPF
    * mode where the ELSE may be converted to an ADD.
    */
   if (!p->single_program_flow && devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Pushed on the same stack as the IF; popped together by brw_ENDIF. */
   push_if_stack(p, insn);
}
1659 
/* Close the innermost IF/ELSE block: emit an ENDIF (or, in SPF mode on
 * gfx4-5, convert the IF/ELSE into predicated ADDs instead) and patch the
 * jump targets of the matching IF/ELSE instructions.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gfx6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gfx4 and
    * Gfx5.
    */
   if (devinfo->ver < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand encodings for the ENDIF itself. */
   if (devinfo->ver < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->ver == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->ver < 6) {
      brw_inst_set_gfx4_jump_count(devinfo, insn, 0);
      brw_inst_set_gfx4_pop_count(devinfo, insn, 1);
   } else if (devinfo->ver == 6) {
      /* A count of 2 is one instruction at the gfx6+ jump scale —
       * presumably pointing the ENDIF at the following instruction;
       * confirm against brw_jump_scale().
       */
      brw_inst_set_gfx6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1740 
1741 brw_inst *
brw_BREAK(struct brw_codegen * p)1742 brw_BREAK(struct brw_codegen *p)
1743 {
1744    const struct intel_device_info *devinfo = p->devinfo;
1745    brw_inst *insn;
1746 
1747    insn = next_insn(p, BRW_OPCODE_BREAK);
1748    if (devinfo->ver >= 8) {
1749       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1750       brw_set_src0(p, insn, brw_imm_d(0x0));
1751    } else if (devinfo->ver >= 6) {
1752       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1753       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1754       brw_set_src1(p, insn, brw_imm_d(0x0));
1755    } else {
1756       brw_set_dest(p, insn, brw_ip_reg());
1757       brw_set_src0(p, insn, brw_ip_reg());
1758       brw_set_src1(p, insn, brw_imm_d(0x0));
1759       brw_inst_set_gfx4_pop_count(devinfo, insn,
1760                                   p->if_depth_in_loop[p->loop_stack_depth]);
1761    }
1762    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1763    brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1764 
1765    return insn;
1766 }
1767 
1768 brw_inst *
brw_CONT(struct brw_codegen * p)1769 brw_CONT(struct brw_codegen *p)
1770 {
1771    const struct intel_device_info *devinfo = p->devinfo;
1772    brw_inst *insn;
1773 
1774    insn = next_insn(p, BRW_OPCODE_CONTINUE);
1775    brw_set_dest(p, insn, brw_ip_reg());
1776    if (devinfo->ver >= 8) {
1777       brw_set_src0(p, insn, brw_imm_d(0x0));
1778    } else {
1779       brw_set_src0(p, insn, brw_ip_reg());
1780       brw_set_src1(p, insn, brw_imm_d(0x0));
1781    }
1782 
1783    if (devinfo->ver < 6) {
1784       brw_inst_set_gfx4_pop_count(devinfo, insn,
1785                                   p->if_depth_in_loop[p->loop_stack_depth]);
1786    }
1787    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1788    brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1789    return insn;
1790 }
1791 
1792 brw_inst *
brw_HALT(struct brw_codegen * p)1793 brw_HALT(struct brw_codegen *p)
1794 {
1795    const struct intel_device_info *devinfo = p->devinfo;
1796    brw_inst *insn;
1797 
1798    insn = next_insn(p, BRW_OPCODE_HALT);
1799    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1800    if (devinfo->ver < 6) {
1801       /* From the Gfx4 PRM:
1802        *
1803        *    "IP register must be put (for example, by the assembler) at <dst>
1804        *    and <src0> locations.
1805        */
1806       brw_set_dest(p, insn, brw_ip_reg());
1807       brw_set_src0(p, insn, brw_ip_reg());
1808       brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
1809    } else if (devinfo->ver < 8) {
1810       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1811       brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1812    } else if (devinfo->ver < 12) {
1813       brw_set_src0(p, insn, brw_imm_d(0x0));
1814    }
1815 
1816    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1817    brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1818    return insn;
1819 }
1820 
1821 /* DO/WHILE loop:
1822  *
1823  * The DO/WHILE is just an unterminated loop -- break or continue are
1824  * used for control within the loop.  We have a few ways they can be
1825  * done.
1826  *
1827  * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1828  * jip and no DO instruction.
1829  *
1830  * For non-uniform control flow pre-gfx6, there's a DO instruction to
1831  * push the mask, and a WHILE to jump back, and BREAK to get out and
1832  * pop the mask.
1833  *
1834  * For gfx6, there's no more mask stack, so no need for DO.  WHILE
1835  * just points back to the first instruction of the loop.
1836  */
1837 brw_inst *
brw_DO(struct brw_codegen * p,unsigned execute_size)1838 brw_DO(struct brw_codegen *p, unsigned execute_size)
1839 {
1840    const struct intel_device_info *devinfo = p->devinfo;
1841 
1842    if (devinfo->ver >= 6 || p->single_program_flow) {
1843       push_loop_stack(p, &p->store[p->nr_insn]);
1844       return &p->store[p->nr_insn];
1845    } else {
1846       brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1847 
1848       push_loop_stack(p, insn);
1849 
1850       /* Override the defaults for this instruction:
1851        */
1852       brw_set_dest(p, insn, brw_null_reg());
1853       brw_set_src0(p, insn, brw_null_reg());
1854       brw_set_src1(p, insn, brw_null_reg());
1855 
1856       brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1857       brw_inst_set_exec_size(devinfo, insn, execute_size);
1858       brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1859 
1860       return insn;
1861    }
1862 }
1863 
1864 /**
1865  * For pre-gfx6, we patch BREAK/CONT instructions to point at the WHILE
1866  * instruction here.
1867  *
1868  * For gfx6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1869  * nesting, since it can always just point to the end of the block/current loop.
1870  */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   /* Jump counts are in units of br per instruction. */
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->ver < 6);

   /* Walk backwards from just before the WHILE to just after the matching
    * DO, patching any unpatched BREAK/CONT found in between.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gfx4_jump_count(devinfo, inst) == 0) {
         /* BREAK jumps to just past the WHILE (hence the +1). */
         brw_inst_set_gfx4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gfx4_jump_count(devinfo, inst) == 0) {
         /* CONTINUE jumps to the WHILE itself. */
         brw_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
1895 
/* Emit the WHILE that closes the innermost DO..WHILE loop and pop the loop
 * stack.  The backward jump distance to the loop top is negative
 * (do_insn precedes insn in the store).
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->ver >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->ver >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         if (devinfo->ver < 12)
            brw_set_src0(p, insn, brw_imm_d(0));
         /* Backward jump to the first instruction of the loop body. */
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->ver == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         /* Gfx6: jump count lives in the destination immediate. */
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gfx6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         /* SPF mode: express the backward branch as an ADD on IP
          * (16 bytes per instruction).
          */
	 insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

         /* Jump to just past the DO (hence the +1); BREAK/CONT inside the
          * loop are patched now that the WHILE's position is known.
          */
         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gfx4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gfx4_pop_count(devinfo, insn, 0);

	 brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1958 
1959 /* FORWARD JUMPS:
1960  */
brw_land_fwd_jump(struct brw_codegen * p,int jmp_insn_idx)1961 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1962 {
1963    const struct intel_device_info *devinfo = p->devinfo;
1964    brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1965    unsigned jmpi = 1;
1966 
1967    if (devinfo->ver >= 5)
1968       jmpi = 2;
1969 
1970    assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1971    assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1972 
1973    brw_inst_set_gfx4_jump_count(devinfo, jmp_insn,
1974                                 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1975 }
1976 
1977 /* To integrate with the above, it makes sense that the comparison
1978  * instruction should populate the flag register.  It might be simpler
1979  * just to use the flag reg for most WM tasks?
1980  */
brw_CMP(struct brw_codegen * p,struct brw_reg dest,unsigned conditional,struct brw_reg src0,struct brw_reg src1)1981 void brw_CMP(struct brw_codegen *p,
1982 	     struct brw_reg dest,
1983 	     unsigned conditional,
1984 	     struct brw_reg src0,
1985 	     struct brw_reg src1)
1986 {
1987    const struct intel_device_info *devinfo = p->devinfo;
1988    brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1989 
1990    brw_inst_set_cond_modifier(devinfo, insn, conditional);
1991    brw_set_dest(p, insn, dest);
1992    brw_set_src0(p, insn, src0);
1993    brw_set_src1(p, insn, src1);
1994 
1995    /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1996     * page says:
1997     *    "Any CMP instruction with a null destination must use a {switch}."
1998     *
1999     * It also applies to other Gfx7 platforms (IVB, BYT) even though it isn't
2000     * mentioned on their work-arounds pages.
2001     */
2002    if (devinfo->ver == 7) {
2003       if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
2004           dest.nr == BRW_ARF_NULL) {
2005          brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
2006       }
2007    }
2008 }
2009 
brw_CMPN(struct brw_codegen * p,struct brw_reg dest,unsigned conditional,struct brw_reg src0,struct brw_reg src1)2010 void brw_CMPN(struct brw_codegen *p,
2011               struct brw_reg dest,
2012               unsigned conditional,
2013               struct brw_reg src0,
2014               struct brw_reg src1)
2015 {
2016    const struct intel_device_info *devinfo = p->devinfo;
2017    brw_inst *insn = next_insn(p, BRW_OPCODE_CMPN);
2018 
2019    brw_inst_set_cond_modifier(devinfo, insn, conditional);
2020    brw_set_dest(p, insn, dest);
2021    brw_set_src0(p, insn, src0);
2022    brw_set_src1(p, insn, src1);
2023 
2024    /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA)
2025     * says:
2026     *
2027     *    If the destination is the null register, the {Switch} instruction
2028     *    option must be used.
2029     *
2030     * Page 77 of the Haswell PRM Volume 2b contains the same text.
2031     */
2032    if (devinfo->ver == 7) {
2033       if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
2034           dest.nr == BRW_ARF_NULL) {
2035          brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
2036       }
2037    }
2038 }
2039 
2040 /***********************************************************************
2041  * Helpers for the various SEND message types:
2042  */
2043 
2044 /** Extended math function, float[8].
2045  */
gfx4_math(struct brw_codegen * p,struct brw_reg dest,unsigned function,unsigned msg_reg_nr,struct brw_reg src,unsigned precision)2046 void gfx4_math(struct brw_codegen *p,
2047 	       struct brw_reg dest,
2048 	       unsigned function,
2049 	       unsigned msg_reg_nr,
2050 	       struct brw_reg src,
2051 	       unsigned precision )
2052 {
2053    const struct intel_device_info *devinfo = p->devinfo;
2054    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2055    unsigned data_type;
2056    if (has_scalar_region(src)) {
2057       data_type = BRW_MATH_DATA_SCALAR;
2058    } else {
2059       data_type = BRW_MATH_DATA_VECTOR;
2060    }
2061 
2062    assert(devinfo->ver < 6);
2063 
2064    /* Example code doesn't set predicate_control for send
2065     * instructions.
2066     */
2067    brw_inst_set_pred_control(devinfo, insn, 0);
2068    brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2069 
2070    brw_set_dest(p, insn, dest);
2071    brw_set_src0(p, insn, src);
2072    brw_set_math_message(p,
2073                         insn,
2074                         function,
2075                         src.type == BRW_REGISTER_TYPE_D,
2076                         precision,
2077                         data_type);
2078 }
2079 
gfx6_math(struct brw_codegen * p,struct brw_reg dest,unsigned function,struct brw_reg src0,struct brw_reg src1)2080 void gfx6_math(struct brw_codegen *p,
2081 	       struct brw_reg dest,
2082 	       unsigned function,
2083 	       struct brw_reg src0,
2084 	       struct brw_reg src1)
2085 {
2086    const struct intel_device_info *devinfo = p->devinfo;
2087    brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
2088 
2089    assert(devinfo->ver >= 6);
2090 
2091    assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
2092           (devinfo->ver >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
2093 
2094    assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
2095    if (devinfo->ver == 6) {
2096       assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
2097       assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
2098    }
2099 
2100    if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
2101        function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
2102        function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
2103       assert(src0.type != BRW_REGISTER_TYPE_F);
2104       assert(src1.type != BRW_REGISTER_TYPE_F);
2105       assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2106              (devinfo->ver >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2107       /* From BSpec 6647/47428 "[Instruction] Extended Math Function":
2108        *     INT DIV function does not support source modifiers.
2109        */
2110       assert(!src0.negate);
2111       assert(!src0.abs);
2112       assert(!src1.negate);
2113       assert(!src1.abs);
2114    } else {
2115       assert(src0.type == BRW_REGISTER_TYPE_F ||
2116              (src0.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));
2117       assert(src1.type == BRW_REGISTER_TYPE_F ||
2118              (src1.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));
2119    }
2120 
2121    /* Source modifiers are ignored for extended math instructions on Gfx6. */
2122    if (devinfo->ver == 6) {
2123       assert(!src0.negate);
2124       assert(!src0.abs);
2125       assert(!src1.negate);
2126       assert(!src1.abs);
2127    }
2128 
2129    brw_inst_set_math_function(devinfo, insn, function);
2130 
2131    brw_set_dest(p, insn, dest);
2132    brw_set_src0(p, insn, src0);
2133    brw_set_src1(p, insn, src1);
2134 }
2135 
2136 /**
2137  * Return the right surface index to access the thread scratch space using
2138  * stateless dataport messages.
2139  */
2140 unsigned
brw_scratch_surface_idx(const struct brw_codegen * p)2141 brw_scratch_surface_idx(const struct brw_codegen *p)
2142 {
2143    /* The scratch space is thread-local so IA coherency is unnecessary. */
2144    if (p->devinfo->ver >= 8)
2145       return GFX8_BTI_STATELESS_NON_COHERENT;
2146    else
2147       return BRW_BTI_STATELESS;
2148 }
2149 
2150 /**
2151  * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2152  * using a constant offset per channel.
2153  *
2154  * The offset must be aligned to oword size (16 bytes).  Used for
2155  * register spilling.
2156  */
brw_oword_block_write_scratch(struct brw_codegen * p,struct brw_reg mrf,int num_regs,unsigned offset)2157 void brw_oword_block_write_scratch(struct brw_codegen *p,
2158 				   struct brw_reg mrf,
2159 				   int num_regs,
2160 				   unsigned offset)
2161 {
2162    const struct intel_device_info *devinfo = p->devinfo;
2163    const unsigned target_cache =
2164       (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
2165        devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2166        BRW_SFID_DATAPORT_WRITE);
2167    const struct tgl_swsb swsb = brw_get_default_swsb(p);
2168    uint32_t msg_type;
2169 
2170    if (devinfo->ver >= 6)
2171       offset /= 16;
2172 
2173    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2174 
2175    const unsigned mlen = 1 + num_regs;
2176 
2177    /* Set up the message header.  This is g0, with g0.2 filled with
2178     * the offset.  We don't want to leave our offset around in g0 or
2179     * it'll screw up texture samples, so set it up inside the message
2180     * reg.
2181     */
2182    {
2183       brw_push_insn_state(p);
2184       brw_set_default_exec_size(p, BRW_EXECUTE_8);
2185       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2186       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2187       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2188 
2189       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2190 
2191       /* set message header global offset field (reg 0, element 2) */
2192       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2193       brw_set_default_swsb(p, tgl_swsb_null());
2194       brw_MOV(p,
2195 	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2196 				  mrf.nr,
2197 				  2), BRW_REGISTER_TYPE_UD),
2198 	      brw_imm_ud(offset));
2199 
2200       brw_pop_insn_state(p);
2201       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2202    }
2203 
2204    {
2205       struct brw_reg dest;
2206       brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2207       int send_commit_msg;
2208       struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2209 					 BRW_REGISTER_TYPE_UW);
2210 
2211       brw_inst_set_sfid(devinfo, insn, target_cache);
2212       brw_inst_set_compression(devinfo, insn, false);
2213 
2214       if (brw_inst_exec_size(devinfo, insn) >= 16)
2215 	 src_header = vec16(src_header);
2216 
2217       assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2218       if (devinfo->ver < 6)
2219          brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2220 
2221       /* Until gfx6, writes followed by reads from the same location
2222        * are not guaranteed to be ordered unless write_commit is set.
2223        * If set, then a no-op write is issued to the destination
2224        * register to set a dependency, and a read from the destination
2225        * can be used to ensure the ordering.
2226        *
2227        * For gfx6, only writes between different threads need ordering
2228        * protection.  Our use of DP writes is all about register
2229        * spilling within a thread.
2230        */
2231       if (devinfo->ver >= 6) {
2232 	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2233 	 send_commit_msg = 0;
2234       } else {
2235 	 dest = src_header;
2236 	 send_commit_msg = 1;
2237       }
2238 
2239       brw_set_dest(p, insn, dest);
2240       if (devinfo->ver >= 6) {
2241 	 brw_set_src0(p, insn, mrf);
2242       } else {
2243 	 brw_set_src0(p, insn, brw_null_reg());
2244       }
2245 
2246       if (devinfo->ver >= 6)
2247 	 msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2248       else
2249 	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2250 
2251       brw_set_desc(p, insn,
2252                    brw_message_desc(devinfo, mlen, send_commit_msg, true) |
2253                    brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
2254                                      BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2255                                      msg_type, send_commit_msg));
2256    }
2257 }
2258 
2259 
2260 /**
2261  * Read a block of owords (half a GRF each) from the scratch buffer
2262  * using a constant index per channel.
2263  *
2264  * Offset must be aligned to oword size (16 bytes).  Used for register
2265  * spilling.
2266  */
2267 void
brw_oword_block_read_scratch(struct brw_codegen * p,struct brw_reg dest,struct brw_reg mrf,int num_regs,unsigned offset)2268 brw_oword_block_read_scratch(struct brw_codegen *p,
2269 			     struct brw_reg dest,
2270 			     struct brw_reg mrf,
2271 			     int num_regs,
2272 			     unsigned offset)
2273 {
2274    const struct intel_device_info *devinfo = p->devinfo;
2275    const struct tgl_swsb swsb = brw_get_default_swsb(p);
2276 
2277    if (devinfo->ver >= 6)
2278       offset /= 16;
2279 
2280    if (p->devinfo->ver >= 7) {
2281       /* On gen 7 and above, we no longer have message registers and we can
2282        * send from any register we want.  By using the destination register
2283        * for the message, we guarantee that the implied message write won't
2284        * accidentally overwrite anything.  This has been a problem because
2285        * the MRF registers and source for the final FB write are both fixed
2286        * and may overlap.
2287        */
2288       mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2289    } else {
2290       mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2291    }
2292    dest = retype(dest, BRW_REGISTER_TYPE_UW);
2293 
2294    const unsigned rlen = num_regs;
2295    const unsigned target_cache =
2296       (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
2297        devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2298        BRW_SFID_DATAPORT_READ);
2299 
2300    {
2301       brw_push_insn_state(p);
2302       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2303       brw_set_default_exec_size(p, BRW_EXECUTE_8);
2304       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2305       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2306 
2307       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2308 
2309       /* set message header global offset field (reg 0, element 2) */
2310       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2311       brw_set_default_swsb(p, tgl_swsb_null());
2312       brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2313 
2314       brw_pop_insn_state(p);
2315       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2316    }
2317 
2318    {
2319       brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2320 
2321       brw_inst_set_sfid(devinfo, insn, target_cache);
2322       assert(brw_inst_pred_control(devinfo, insn) == 0);
2323       brw_inst_set_compression(devinfo, insn, false);
2324 
2325       brw_set_dest(p, insn, dest);	/* UW? */
2326       if (devinfo->ver >= 6) {
2327 	 brw_set_src0(p, insn, mrf);
2328       } else {
2329 	 brw_set_src0(p, insn, brw_null_reg());
2330          brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2331       }
2332 
2333       brw_set_desc(p, insn,
2334                    brw_message_desc(devinfo, 1, rlen, true) |
2335                    brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
2336                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2337                                     BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2338                                     BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2339    }
2340 }
2341 
2342 void
gfx7_block_read_scratch(struct brw_codegen * p,struct brw_reg dest,int num_regs,unsigned offset)2343 gfx7_block_read_scratch(struct brw_codegen *p,
2344                         struct brw_reg dest,
2345                         int num_regs,
2346                         unsigned offset)
2347 {
2348    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2349    assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2350 
2351    brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2352 
2353    /* The HW requires that the header is present; this is to get the g0.5
2354     * scratch offset.
2355     */
2356    brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2357 
2358    /* According to the docs, offset is "A 12-bit HWord offset into the memory
2359     * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
2360     * is 32 bytes, which happens to be the size of a register.
2361     */
2362    offset /= REG_SIZE;
2363    assert(offset < (1 << 12));
2364 
2365    gfx7_set_dp_scratch_message(p, insn,
2366                                false, /* scratch read */
2367                                false, /* OWords */
2368                                false, /* invalidate after read */
2369                                num_regs,
2370                                offset,
2371                                1,        /* mlen: just g0 */
2372                                num_regs, /* rlen */
2373                                true);    /* header present */
2374 }
2375 
2376 /**
2377  * Read float[4] vectors from the data port constant cache.
2378  * Location (in buffer) should be a multiple of 16.
2379  * Used for fetching shader constants.
2380  */
brw_oword_block_read(struct brw_codegen * p,struct brw_reg dest,struct brw_reg mrf,uint32_t offset,uint32_t bind_table_index)2381 void brw_oword_block_read(struct brw_codegen *p,
2382 			  struct brw_reg dest,
2383 			  struct brw_reg mrf,
2384 			  uint32_t offset,
2385 			  uint32_t bind_table_index)
2386 {
2387    const struct intel_device_info *devinfo = p->devinfo;
2388    const unsigned target_cache =
2389       (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_CONSTANT_CACHE :
2390        BRW_SFID_DATAPORT_READ);
2391    const unsigned exec_size = 1 << brw_get_default_exec_size(p);
2392    const struct tgl_swsb swsb = brw_get_default_swsb(p);
2393 
2394    /* On newer hardware, offset is in units of owords. */
2395    if (devinfo->ver >= 6)
2396       offset /= 16;
2397 
2398    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2399 
2400    brw_push_insn_state(p);
2401    brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2402    brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2403    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2404 
2405    brw_push_insn_state(p);
2406    brw_set_default_exec_size(p, BRW_EXECUTE_8);
2407    brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2408    brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2409 
2410    /* set message header global offset field (reg 0, element 2) */
2411    brw_set_default_exec_size(p, BRW_EXECUTE_1);
2412    brw_set_default_swsb(p, tgl_swsb_null());
2413    brw_MOV(p,
2414 	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2415 			       mrf.nr,
2416 			       2), BRW_REGISTER_TYPE_UD),
2417 	   brw_imm_ud(offset));
2418    brw_pop_insn_state(p);
2419 
2420    brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2421 
2422    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2423 
2424    brw_inst_set_sfid(devinfo, insn, target_cache);
2425 
2426    /* cast dest to a uword[8] vector */
2427    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2428 
2429    brw_set_dest(p, insn, dest);
2430    if (devinfo->ver >= 6) {
2431       brw_set_src0(p, insn, mrf);
2432    } else {
2433       brw_set_src0(p, insn, brw_null_reg());
2434       brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2435    }
2436 
2437    brw_set_desc(p, insn,
2438                 brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
2439                 brw_dp_read_desc(devinfo, bind_table_index,
2440                                  BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2441                                  BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2442                                  BRW_DATAPORT_READ_TARGET_DATA_CACHE));
2443 
2444    brw_pop_insn_state(p);
2445 }
2446 
2447 brw_inst *
brw_fb_WRITE(struct brw_codegen * p,struct brw_reg payload,struct brw_reg implied_header,unsigned msg_control,unsigned binding_table_index,unsigned msg_length,unsigned response_length,bool eot,bool last_render_target,bool header_present)2448 brw_fb_WRITE(struct brw_codegen *p,
2449              struct brw_reg payload,
2450              struct brw_reg implied_header,
2451              unsigned msg_control,
2452              unsigned binding_table_index,
2453              unsigned msg_length,
2454              unsigned response_length,
2455              bool eot,
2456              bool last_render_target,
2457              bool header_present)
2458 {
2459    const struct intel_device_info *devinfo = p->devinfo;
2460    const unsigned target_cache =
2461       (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2462        BRW_SFID_DATAPORT_WRITE);
2463    brw_inst *insn;
2464    struct brw_reg dest, src0;
2465 
2466    if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
2467       dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2468    else
2469       dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2470 
2471    if (devinfo->ver >= 6) {
2472       insn = next_insn(p, BRW_OPCODE_SENDC);
2473    } else {
2474       insn = next_insn(p, BRW_OPCODE_SEND);
2475    }
2476    brw_inst_set_sfid(devinfo, insn, target_cache);
2477    brw_inst_set_compression(devinfo, insn, false);
2478 
2479    if (devinfo->ver >= 6) {
2480       /* headerless version, just submit color payload */
2481       src0 = payload;
2482    } else {
2483       assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2484       brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2485       src0 = implied_header;
2486    }
2487 
2488    brw_set_dest(p, insn, dest);
2489    brw_set_src0(p, insn, src0);
2490    brw_set_desc(p, insn,
2491                 brw_message_desc(devinfo, msg_length, response_length,
2492                                  header_present) |
2493                 brw_fb_write_desc(devinfo, binding_table_index, msg_control,
2494                                   last_render_target,
2495                                   false /* coarse_write */));
2496    brw_inst_set_eot(devinfo, insn, eot);
2497 
2498    return insn;
2499 }
2500 
2501 brw_inst *
gfx9_fb_READ(struct brw_codegen * p,struct brw_reg dst,struct brw_reg payload,unsigned binding_table_index,unsigned msg_length,unsigned response_length,bool per_sample)2502 gfx9_fb_READ(struct brw_codegen *p,
2503              struct brw_reg dst,
2504              struct brw_reg payload,
2505              unsigned binding_table_index,
2506              unsigned msg_length,
2507              unsigned response_length,
2508              bool per_sample)
2509 {
2510    const struct intel_device_info *devinfo = p->devinfo;
2511    assert(devinfo->ver >= 9);
2512    brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2513 
2514    brw_inst_set_sfid(devinfo, insn, GFX6_SFID_DATAPORT_RENDER_CACHE);
2515    brw_set_dest(p, insn, dst);
2516    brw_set_src0(p, insn, payload);
2517    brw_set_desc(
2518       p, insn,
2519       brw_message_desc(devinfo, msg_length, response_length, true) |
2520       brw_fb_read_desc(devinfo, binding_table_index, 0 /* msg_control */,
2521                        1 << brw_get_default_exec_size(p), per_sample));
2522    brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);
2523 
2524    return insn;
2525 }
2526 
2527 /**
2528  * Texture sample instruction.
2529  * Note: the msg_type plus msg_length values determine exactly what kind
2530  * of sampling operation is performed.  See volume 4, page 161 of docs.
2531  */
brw_SAMPLE(struct brw_codegen * p,struct brw_reg dest,unsigned msg_reg_nr,struct brw_reg src0,unsigned binding_table_index,unsigned sampler,unsigned msg_type,unsigned response_length,unsigned msg_length,unsigned header_present,unsigned simd_mode,unsigned return_format)2532 void brw_SAMPLE(struct brw_codegen *p,
2533 		struct brw_reg dest,
2534 		unsigned msg_reg_nr,
2535 		struct brw_reg src0,
2536 		unsigned binding_table_index,
2537 		unsigned sampler,
2538 		unsigned msg_type,
2539 		unsigned response_length,
2540 		unsigned msg_length,
2541 		unsigned header_present,
2542 		unsigned simd_mode,
2543 		unsigned return_format)
2544 {
2545    const struct intel_device_info *devinfo = p->devinfo;
2546    brw_inst *insn;
2547 
2548    if (msg_reg_nr != -1)
2549       gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2550 
2551    insn = next_insn(p, BRW_OPCODE_SEND);
2552    brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
2553    brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2554 
2555    /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2556     *
2557     *    "Instruction compression is not allowed for this instruction (that
2558     *     is, send). The hardware behavior is undefined if this instruction is
2559     *     set as compressed. However, compress control can be set to "SecHalf"
2560     *     to affect the EMask generation."
2561     *
2562     * No similar wording is found in later PRMs, but there are examples
2563     * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
2564     * are allowed in SIMD16 mode and they could not work without SecHalf.  For
2565     * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2566     */
2567    brw_inst_set_compression(devinfo, insn, false);
2568 
2569    if (devinfo->ver < 6)
2570       brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2571 
2572    brw_set_dest(p, insn, dest);
2573    brw_set_src0(p, insn, src0);
2574    brw_set_desc(p, insn,
2575                 brw_message_desc(devinfo, msg_length, response_length,
2576                                  header_present) |
2577                 brw_sampler_desc(devinfo, binding_table_index, sampler,
2578                                  msg_type, simd_mode, return_format));
2579 }
2580 
2581 /* Adjust the message header's sampler state pointer to
2582  * select the correct group of 16 samplers.
2583  */
brw_adjust_sampler_state_pointer(struct brw_codegen * p,struct brw_reg header,struct brw_reg sampler_index)2584 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2585                                       struct brw_reg header,
2586                                       struct brw_reg sampler_index)
2587 {
2588    /* The "Sampler Index" field can only store values between 0 and 15.
2589     * However, we can add an offset to the "Sampler State Pointer"
2590     * field, effectively selecting a different set of 16 samplers.
2591     *
2592     * The "Sampler State Pointer" needs to be aligned to a 32-byte
2593     * offset, and each sampler state is only 16-bytes, so we can't
2594     * exclusively use the offset - we have to use both.
2595     */
2596 
2597    const struct intel_device_info *devinfo = p->devinfo;
2598 
2599    if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2600       const int sampler_state_size = 16; /* 16 bytes */
2601       uint32_t sampler = sampler_index.ud;
2602 
2603       if (sampler >= 16) {
2604          assert(devinfo->verx10 >= 75);
2605          brw_ADD(p,
2606                  get_element_ud(header, 3),
2607                  get_element_ud(brw_vec8_grf(0, 0), 3),
2608                  brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2609       }
2610    } else {
2611       /* Non-const sampler array indexing case */
2612       if (devinfo->verx10 <= 70) {
2613          return;
2614       }
2615 
2616       struct brw_reg temp = get_element_ud(header, 3);
2617 
2618       brw_push_insn_state(p);
2619       brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2620       brw_set_default_swsb(p, tgl_swsb_regdist(1));
2621       brw_SHL(p, temp, temp, brw_imm_ud(4));
2622       brw_ADD(p,
2623               get_element_ud(header, 3),
2624               get_element_ud(brw_vec8_grf(0, 0), 3),
2625               temp);
2626       brw_pop_insn_state(p);
2627    }
2628 }
2629 
2630 /* All these variables are pretty confusing - we might be better off
2631  * using bitmasks and macros for this, in the old style.  Or perhaps
2632  * just having the caller instantiate the fields in dword3 itself.
2633  */
brw_urb_WRITE(struct brw_codegen * p,struct brw_reg dest,unsigned msg_reg_nr,struct brw_reg src0,enum brw_urb_write_flags flags,unsigned msg_length,unsigned response_length,unsigned offset,unsigned swizzle)2634 void brw_urb_WRITE(struct brw_codegen *p,
2635 		   struct brw_reg dest,
2636 		   unsigned msg_reg_nr,
2637 		   struct brw_reg src0,
2638                    enum brw_urb_write_flags flags,
2639 		   unsigned msg_length,
2640 		   unsigned response_length,
2641 		   unsigned offset,
2642 		   unsigned swizzle)
2643 {
2644    const struct intel_device_info *devinfo = p->devinfo;
2645    brw_inst *insn;
2646 
2647    gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2648 
2649    if (devinfo->ver >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2650       /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2651       brw_push_insn_state(p);
2652       brw_set_default_access_mode(p, BRW_ALIGN_1);
2653       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2654       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2655       brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2656 		       BRW_REGISTER_TYPE_UD),
2657 	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2658 		brw_imm_ud(0xff00));
2659       brw_pop_insn_state(p);
2660    }
2661 
2662    insn = next_insn(p, BRW_OPCODE_SEND);
2663 
2664    assert(msg_length < BRW_MAX_MRF(devinfo->ver));
2665 
2666    brw_set_dest(p, insn, dest);
2667    brw_set_src0(p, insn, src0);
2668    brw_set_src1(p, insn, brw_imm_d(0));
2669 
2670    if (devinfo->ver < 6)
2671       brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2672 
2673    brw_set_urb_message(p,
2674 		       insn,
2675 		       flags,
2676 		       msg_length,
2677 		       response_length,
2678 		       offset,
2679 		       swizzle);
2680 }
2681 
2682 void
brw_send_indirect_message(struct brw_codegen * p,unsigned sfid,struct brw_reg dst,struct brw_reg payload,struct brw_reg desc,unsigned desc_imm,bool eot)2683 brw_send_indirect_message(struct brw_codegen *p,
2684                           unsigned sfid,
2685                           struct brw_reg dst,
2686                           struct brw_reg payload,
2687                           struct brw_reg desc,
2688                           unsigned desc_imm,
2689                           bool eot)
2690 {
2691    const struct intel_device_info *devinfo = p->devinfo;
2692    struct brw_inst *send;
2693 
2694    dst = retype(dst, BRW_REGISTER_TYPE_UW);
2695 
2696    assert(desc.type == BRW_REGISTER_TYPE_UD);
2697 
2698    if (desc.file == BRW_IMMEDIATE_VALUE) {
2699       send = next_insn(p, BRW_OPCODE_SEND);
2700       brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2701       brw_set_desc(p, send, desc.ud | desc_imm);
2702    } else {
2703       const struct tgl_swsb swsb = brw_get_default_swsb(p);
2704       struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2705 
2706       brw_push_insn_state(p);
2707       brw_set_default_access_mode(p, BRW_ALIGN_1);
2708       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2709       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2710       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2711       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2712 
2713       /* Load the indirect descriptor to an address register using OR so the
2714        * caller can specify additional descriptor bits with the desc_imm
2715        * immediate.
2716        */
2717       brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2718 
2719       brw_pop_insn_state(p);
2720 
2721       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2722       send = next_insn(p, BRW_OPCODE_SEND);
2723       brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2724 
2725       if (devinfo->ver >= 12)
2726          brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
2727       else
2728          brw_set_src1(p, send, addr);
2729    }
2730 
2731    brw_set_dest(p, send, dst);
2732    brw_inst_set_sfid(devinfo, send, sfid);
2733    brw_inst_set_eot(devinfo, send, eot);
2734 }
2735 
/**
 * Emit a split-payload SENDS (or Gfx12+ SEND) whose descriptor and
 * extended descriptor may each be either immediates or dynamically
 * computed register values.
 *
 * Register-based descriptors are OR'd with their immediate parts into
 * the address registers a0.0 (desc) and a0.2 (ex_desc) before the send
 * is emitted.
 *
 * \param payload0/payload1  the two message payload halves (src0/src1)
 * \param desc/desc_imm      descriptor register-or-immediate plus
 *                           immediate bits to OR in
 * \param ex_desc/ex_desc_imm  same for the extended descriptor
 * \param eot                whether this send terminates the thread
 */
void
brw_send_indirect_split_message(struct brw_codegen *p,
                                unsigned sfid,
                                struct brw_reg dst,
                                struct brw_reg payload0,
                                struct brw_reg payload1,
                                struct brw_reg desc,
                                unsigned desc_imm,
                                struct brw_reg ex_desc,
                                unsigned ex_desc_imm,
                                bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      /* Fully-known descriptor: just fold in the immediate bits. */
      desc.ud |= desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);
      desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   /* Prior to Gfx12 the instruction encoding has no room for ex_desc
    * bits 15:12, so an immediate with any of those bits set must fall
    * back to the indirect (address register) path below.
    */
   if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
       (devinfo->ver >= 12 ||
        ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
      ex_desc.ud |= ex_desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect extended descriptor to an address register using OR
       * so the caller can specify additional descriptor bits with the
       * desc_imm immediate.
       *
       * Even though the instruction dispatcher always pulls the SFID and EOT
       * fields from the instruction itself, actual external unit which
       * processes the message gets the SFID and EOT from the extended
       * descriptor which comes from the address register.  If we don't OR
       * those two bits in, the external unit may get confused and hang.
       */
      unsigned imm_part = ex_desc_imm | sfid | eot << 5;

      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
         /* ex_desc bits 15:12 don't exist in the instruction encoding prior
          * to Gfx12, so we may have fallen back to an indirect extended
          * descriptor.
          */
         brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
      } else {
         brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
      }

      brw_pop_insn_state(p);
      ex_desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   /* Gfx12 dropped the separate SENDS opcode; plain SEND is split. */
   send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
      brw_inst_set_send_desc(devinfo, send, desc.ud);
   } else {
      /* The indirect path above left the descriptor in a0.0. */
      assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(desc.nr == BRW_ARF_ADDRESS);
      assert(desc.subnr == 0);
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
      brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
   } else {
      /* The extended descriptor subregister is encoded in dword units. */
      assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(ex_desc.nr == BRW_ARF_ADDRESS);
      assert((ex_desc.subnr & 0x3) == 0);
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
      brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
   }

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2852 
2853 static void
brw_send_indirect_surface_message(struct brw_codegen * p,unsigned sfid,struct brw_reg dst,struct brw_reg payload,struct brw_reg surface,unsigned desc_imm)2854 brw_send_indirect_surface_message(struct brw_codegen *p,
2855                                   unsigned sfid,
2856                                   struct brw_reg dst,
2857                                   struct brw_reg payload,
2858                                   struct brw_reg surface,
2859                                   unsigned desc_imm)
2860 {
2861    if (surface.file != BRW_IMMEDIATE_VALUE) {
2862       const struct tgl_swsb swsb = brw_get_default_swsb(p);
2863       struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2864 
2865       brw_push_insn_state(p);
2866       brw_set_default_access_mode(p, BRW_ALIGN_1);
2867       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2868       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2869       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2870       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2871 
2872       /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2873        * some surface array is accessed out of bounds.
2874        */
2875       brw_AND(p, addr,
2876               suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2877                         BRW_GET_SWZ(surface.swizzle, 0)),
2878               brw_imm_ud(0xff));
2879 
2880       brw_pop_insn_state(p);
2881 
2882       surface = addr;
2883       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2884    }
2885 
2886    brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
2887 }
2888 
2889 static bool
while_jumps_before_offset(const struct intel_device_info * devinfo,brw_inst * insn,int while_offset,int start_offset)2890 while_jumps_before_offset(const struct intel_device_info *devinfo,
2891                           brw_inst *insn, int while_offset, int start_offset)
2892 {
2893    int scale = 16 / brw_jump_scale(devinfo);
2894    int jip = devinfo->ver == 6 ? brw_inst_gfx6_jump_count(devinfo, insn)
2895                                : brw_inst_jip(devinfo, insn);
2896    assert(jip < 0);
2897    return while_offset + jip * scale <= start_offset;
2898 }
2899 
2900 
2901 static int
brw_find_next_block_end(struct brw_codegen * p,int start_offset)2902 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2903 {
2904    int offset;
2905    void *store = p->store;
2906    const struct intel_device_info *devinfo = p->devinfo;
2907 
2908    int depth = 0;
2909 
2910    for (offset = next_offset(devinfo, store, start_offset);
2911         offset < p->next_insn_offset;
2912         offset = next_offset(devinfo, store, offset)) {
2913       brw_inst *insn = store + offset;
2914 
2915       switch (brw_inst_opcode(devinfo, insn)) {
2916       case BRW_OPCODE_IF:
2917          depth++;
2918          break;
2919       case BRW_OPCODE_ENDIF:
2920          if (depth == 0)
2921             return offset;
2922          depth--;
2923          break;
2924       case BRW_OPCODE_WHILE:
2925          /* If the while doesn't jump before our instruction, it's the end
2926           * of a sibling do...while loop.  Ignore it.
2927           */
2928          if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2929             continue;
2930          FALLTHROUGH;
2931       case BRW_OPCODE_ELSE:
2932       case BRW_OPCODE_HALT:
2933          if (depth == 0)
2934             return offset;
2935          break;
2936       default:
2937          break;
2938       }
2939    }
2940 
2941    return 0;
2942 }
2943 
2944 /* There is no DO instruction on gfx6, so to find the end of the loop
2945  * we have to see if the loop is jumping back before our start
2946  * instruction.
2947  */
2948 static int
brw_find_loop_end(struct brw_codegen * p,int start_offset)2949 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2950 {
2951    const struct intel_device_info *devinfo = p->devinfo;
2952    int offset;
2953    void *store = p->store;
2954 
2955    assert(devinfo->ver >= 6);
2956 
2957    /* Always start after the instruction (such as a WHILE) we're trying to fix
2958     * up.
2959     */
2960    for (offset = next_offset(devinfo, store, start_offset);
2961         offset < p->next_insn_offset;
2962         offset = next_offset(devinfo, store, offset)) {
2963       brw_inst *insn = store + offset;
2964 
2965       if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2966 	 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2967 	    return offset;
2968       }
2969    }
2970    assert(!"not reached");
2971    return start_offset;
2972 }
2973 
2974 /* After program generation, go back and update the UIP and JIP of
2975  * BREAK, CONT, and HALT instructions to their correct locations.
2976  */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   int offset;
   /* br is the per-generation jump scale; offsets in bytes are divided by
    * scale (16 / br) to produce the encoded jump distances.
    */
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   /* Pre-Gfx6 flow control doesn't use UIP/JIP fields; nothing to patch. */
   if (devinfo->ver < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      /* The fixed 16-byte stride of this walk (and the UIP/JIP fields
       * themselves) require uncompacted instructions.
       */
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         /* JIP: exit of the innermost enclosing block; UIP: loop end. */
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 /* Gfx7 UIP points to WHILE; Gfx6 points just after it */
         brw_inst_set_uip(devinfo, insn,
	    (brw_find_loop_end(p, offset) - offset +
             (devinfo->ver == 6 ? 16 : 0)) / scale);
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;

      case BRW_OPCODE_ENDIF: {
         /* With no further block end, fall through to the next instruction:
          * a jump of one instruction, i.e. br encoding units.
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->ver >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gfx6_jump_count(devinfo, insn, jump);
	 break;
      }

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
	 } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;

      default:
         break;
      }
   }
}
3049 
/**
 * Emit the FF_SYNC SEND message (Gfx6 fixed-function thread
 * synchronization / URB allocation handshake).
 */
void brw_ff_sync(struct brw_codegen *p,
                 struct brw_reg dest,
                 unsigned msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 unsigned response_length,
                 bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* On Gfx6+ the SEND source must be moved to the message register
    * explicitly (see gfx6_resolve_implied_move above).
    */
   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Older parts take the message register number in the instruction. */
   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p, insn, allocate, response_length, eot);
}
3077 
3078 /**
3079  * Emit the SEND instruction necessary to generate stream output data on Gfx6
3080  * (for transform feedback).
3081  *
3082  * If send_commit_msg is true, this is the last piece of stream output data
3083  * from this thread, so send the data as a committed write.  According to the
3084  * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
3085  *
3086  *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
3087  *   writes are complete by sending the final write as a committed write."
3088  */
3089 void
brw_svb_write(struct brw_codegen * p,struct brw_reg dest,unsigned msg_reg_nr,struct brw_reg src0,unsigned binding_table_index,bool send_commit_msg)3090 brw_svb_write(struct brw_codegen *p,
3091               struct brw_reg dest,
3092               unsigned msg_reg_nr,
3093               struct brw_reg src0,
3094               unsigned binding_table_index,
3095               bool   send_commit_msg)
3096 {
3097    const struct intel_device_info *devinfo = p->devinfo;
3098    assert(devinfo->ver == 6);
3099    const unsigned target_cache = GFX6_SFID_DATAPORT_RENDER_CACHE;
3100    brw_inst *insn;
3101 
3102    gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
3103 
3104    insn = next_insn(p, BRW_OPCODE_SEND);
3105    brw_inst_set_sfid(devinfo, insn, target_cache);
3106    brw_set_dest(p, insn, dest);
3107    brw_set_src0(p, insn, src0);
3108    brw_set_desc(p, insn,
3109                 brw_message_desc(devinfo, 1, send_commit_msg, true) |
3110                 brw_dp_write_desc(devinfo, binding_table_index,
3111                                   0, /* msg_control: ignored */
3112                                   GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
3113                                   send_commit_msg)); /* send_commit_msg */
3114 }
3115 
/* Number of registers needed for a surface message payload or response:
 * one register total for SIMD4x2 (exec_size == 0), otherwise one register
 * per channel up to SIMD8 and two per channel beyond that.
 */
static unsigned
brw_surface_payload_size(unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   if (exec_size == 0)
      return 1; /* SIMD4x2 packs everything into a single register */

   const unsigned regs_per_channel = exec_size <= 8 ? 1 : 2;
   return regs_per_channel * num_channels;
}
3127 
3128 void
brw_untyped_atomic(struct brw_codegen * p,struct brw_reg dst,struct brw_reg payload,struct brw_reg surface,unsigned atomic_op,unsigned msg_length,bool response_expected,bool header_present)3129 brw_untyped_atomic(struct brw_codegen *p,
3130                    struct brw_reg dst,
3131                    struct brw_reg payload,
3132                    struct brw_reg surface,
3133                    unsigned atomic_op,
3134                    unsigned msg_length,
3135                    bool response_expected,
3136                    bool header_present)
3137 {
3138    const struct intel_device_info *devinfo = p->devinfo;
3139    const unsigned sfid = (devinfo->verx10 >= 75 ?
3140                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3141                           GFX7_SFID_DATAPORT_DATA_CACHE);
3142    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3143    /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
3144    const bool has_simd4x2 = devinfo->verx10 >= 75;
3145    const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3146                               has_simd4x2 ? 0 : 8;
3147    const unsigned response_length =
3148       brw_surface_payload_size(response_expected, exec_size);
3149    const unsigned desc =
3150       brw_message_desc(devinfo, msg_length, response_length, header_present) |
3151       brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
3152                                  response_expected);
3153    /* Mask out unused components -- This is especially important in Align16
3154     * mode on generations that don't have native support for SIMD4x2 atomics,
3155     * because unused but enabled components will cause the dataport to perform
3156     * additional atomic operations on the addresses that happen to be in the
3157     * uninitialized Y, Z and W coordinates of the payload.
3158     */
3159    const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3160 
3161    brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
3162                                      payload, surface, desc);
3163 }
3164 
3165 void
brw_untyped_surface_read(struct brw_codegen * p,struct brw_reg dst,struct brw_reg payload,struct brw_reg surface,unsigned msg_length,unsigned num_channels)3166 brw_untyped_surface_read(struct brw_codegen *p,
3167                          struct brw_reg dst,
3168                          struct brw_reg payload,
3169                          struct brw_reg surface,
3170                          unsigned msg_length,
3171                          unsigned num_channels)
3172 {
3173    const struct intel_device_info *devinfo = p->devinfo;
3174    const unsigned sfid = (devinfo->verx10 >= 75 ?
3175                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3176                           GFX7_SFID_DATAPORT_DATA_CACHE);
3177    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3178    const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
3179    const unsigned response_length =
3180       brw_surface_payload_size(num_channels, exec_size);
3181    const unsigned desc =
3182       brw_message_desc(devinfo, msg_length, response_length, false) |
3183       brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
3184 
3185    brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
3186 }
3187 
3188 void
brw_untyped_surface_write(struct brw_codegen * p,struct brw_reg payload,struct brw_reg surface,unsigned msg_length,unsigned num_channels,bool header_present)3189 brw_untyped_surface_write(struct brw_codegen *p,
3190                           struct brw_reg payload,
3191                           struct brw_reg surface,
3192                           unsigned msg_length,
3193                           unsigned num_channels,
3194                           bool header_present)
3195 {
3196    const struct intel_device_info *devinfo = p->devinfo;
3197    const unsigned sfid = (devinfo->verx10 >= 75 ?
3198                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3199                           GFX7_SFID_DATAPORT_DATA_CACHE);
3200    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3201    /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3202    const bool has_simd4x2 = devinfo->verx10 >= 75;
3203    const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3204                               has_simd4x2 ? 0 : 8;
3205    const unsigned desc =
3206       brw_message_desc(devinfo, msg_length, 0, header_present) |
3207       brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
3208    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3209    const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
3210 
3211    brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
3212                                      payload, surface, desc);
3213 }
3214 
3215 static void
brw_set_memory_fence_message(struct brw_codegen * p,struct brw_inst * insn,enum brw_message_target sfid,bool commit_enable,unsigned bti)3216 brw_set_memory_fence_message(struct brw_codegen *p,
3217                              struct brw_inst *insn,
3218                              enum brw_message_target sfid,
3219                              bool commit_enable,
3220                              unsigned bti)
3221 {
3222    const struct intel_device_info *devinfo = p->devinfo;
3223 
3224    brw_set_desc(p, insn, brw_message_desc(
3225                    devinfo, 1, (commit_enable ? 1 : 0), true));
3226 
3227    brw_inst_set_sfid(devinfo, insn, sfid);
3228 
3229    switch (sfid) {
3230    case GFX6_SFID_DATAPORT_RENDER_CACHE:
3231       brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE);
3232       break;
3233    case GFX7_SFID_DATAPORT_DATA_CACHE:
3234       brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE);
3235       break;
3236    default:
3237       unreachable("Not reached");
3238    }
3239 
3240    if (commit_enable)
3241       brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3242 
3243    assert(devinfo->ver >= 11 || bti == 0);
3244    brw_inst_set_binding_table_index(devinfo, insn, bti);
3245 }
3246 
3247 static void
gfx12_set_memory_fence_message(struct brw_codegen * p,struct brw_inst * insn,enum brw_message_target sfid)3248 gfx12_set_memory_fence_message(struct brw_codegen *p,
3249                                struct brw_inst *insn,
3250                                enum brw_message_target sfid)
3251 {
3252    const unsigned mlen = 1; /* g0 header */
3253     /* Completion signaled by write to register. No data returned. */
3254    const unsigned rlen = 1;
3255 
3256    brw_inst_set_sfid(p->devinfo, insn, sfid);
3257 
3258    if (sfid == BRW_SFID_URB) {
3259       brw_set_desc(p, insn, brw_urb_fence_desc(p->devinfo) |
3260                             brw_message_desc(p->devinfo, mlen, rlen, false));
3261    } else {
3262       enum lsc_fence_scope scope = LSC_FENCE_THREADGROUP;
3263       enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE;
3264 
3265       if (sfid == GFX12_SFID_TGM) {
3266          scope = LSC_FENCE_TILE;
3267          flush_type = LSC_FLUSH_TYPE_EVICT;
3268       }
3269 
3270       brw_set_desc(p, insn, lsc_fence_msg_desc(p->devinfo, scope,
3271                                                flush_type, false) |
3272                             brw_message_desc(p->devinfo, mlen, rlen, false));
3273    }
3274 }
3275 
3276 void
brw_memory_fence(struct brw_codegen * p,struct brw_reg dst,struct brw_reg src,enum opcode send_op,enum brw_message_target sfid,bool commit_enable,unsigned bti)3277 brw_memory_fence(struct brw_codegen *p,
3278                  struct brw_reg dst,
3279                  struct brw_reg src,
3280                  enum opcode send_op,
3281                  enum brw_message_target sfid,
3282                  bool commit_enable,
3283                  unsigned bti)
3284 {
3285    const struct intel_device_info *devinfo = p->devinfo;
3286 
3287    dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
3288    src = retype(vec1(src), BRW_REGISTER_TYPE_UD);
3289 
3290    /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3291     * message doesn't write anything back.
3292     */
3293    struct brw_inst *insn = next_insn(p, send_op);
3294    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3295    brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3296    brw_set_dest(p, insn, dst);
3297    brw_set_src0(p, insn, src);
3298 
3299    /* All DG2 hardware requires LSC for fence messages, even A-step */
3300    if (devinfo->has_lsc)
3301       gfx12_set_memory_fence_message(p, insn, sfid);
3302    else
3303       brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
3304 }
3305 
3306 void
brw_pixel_interpolator_query(struct brw_codegen * p,struct brw_reg dest,struct brw_reg mrf,bool noperspective,bool coarse_pixel_rate,unsigned mode,struct brw_reg data,unsigned msg_length,unsigned response_length)3307 brw_pixel_interpolator_query(struct brw_codegen *p,
3308                              struct brw_reg dest,
3309                              struct brw_reg mrf,
3310                              bool noperspective,
3311                              bool coarse_pixel_rate,
3312                              unsigned mode,
3313                              struct brw_reg data,
3314                              unsigned msg_length,
3315                              unsigned response_length)
3316 {
3317    const struct intel_device_info *devinfo = p->devinfo;
3318    const uint16_t exec_size = brw_get_default_exec_size(p);
3319    const unsigned slot_group = brw_get_default_group(p) / 16;
3320    const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3321    const unsigned desc =
3322       brw_message_desc(devinfo, msg_length, response_length, false) |
3323       brw_pixel_interp_desc(devinfo, mode, noperspective, coarse_pixel_rate,
3324                             simd_mode, slot_group);
3325 
3326    /* brw_send_indirect_message will automatically use a direct send message
3327     * if data is actually immediate.
3328     */
3329    brw_send_indirect_message(p,
3330                              GFX7_SFID_PIXEL_INTERPOLATOR,
3331                              dest,
3332                              mrf,
3333                              vec1(data),
3334                              desc,
3335                              false);
3336 }
3337 
/**
 * Emit code that writes the index of the first live (enabled) channel of the
 * current execution mask into component 0 of \p dst.
 *
 * \p mask must be a UD register or immediate holding the dispatch (or
 * vector) mask; it is combined with ce0 on Gfx8+ because ce0 alone doesn't
 * account for channels that were never dispatched.  Pass an immediate ~0u to
 * skip that correction when the dispatch mask is known to be tightly packed.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->ver >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   /* The flag register is only used on Gfx7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register. This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->ver >= 8) {
         /* Getting the first active channel index is easy on Gfx8: Just find
          * the first bit set in the execution mask.  The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n).  Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_set_default_swsb(p, tgl_swsb_regdist(1))
;
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         /* Gfx7 path: accumulate the execution mask into a flag register. */
         const struct brw_reg flag = brw_flag_subreg(flag_subreg);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0.  We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gfx7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      /* Align16 (SIMD4x2) variants. */
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->ver >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.  Note that ce0
          * doesn't take into account the dispatch mask, so the Gfx7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3460 
/**
 * Emit code that copies component \p idx of \p src into all enabled channels
 * of \p dst (a "broadcast" of one channel's scalar value).
 *
 * \p idx may be an immediate (handled with a simple strided MOV) or a
 * register, in which case indirect addressing is used to fetch the selected
 * component in align1 mode, or a flag-predicated SEL in align16 mode (where
 * the index can only be 0 or 1).
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      src = align1 ? stride(suboffset(src, i), 0, 1, 0) :
                     stride(suboffset(src, 4 * i), 0, 4, 1);

      if (type_sz(src.type) > 4 && !devinfo->has_64bit_float) {
         /* Split the 64-bit move into two 32-bit moves on hardware without
          * 64-bit float support.
          */
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    subscript(src, BRW_REGISTER_TYPE_D, 0));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    subscript(src, BRW_REGISTER_TYPE_D, 1));
      } else {
         brw_MOV(p, dst, src);
      }
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(util_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) ||
              !devinfo->has_64bit_float)) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of this issue, we do two integer MOVs
             * insead of one 64-bit MOV.  Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                       retype(brw_vec1_indirect(addr.subnr, offset),
                              BRW_REGISTER_TYPE_D));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                       retype(brw_vec1_indirect(addr.subnr, offset + 4),
                              BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3594 
3595 /**
3596  * This instruction is generated as a single-channel align1 instruction by
3597  * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3598  *
3599  * We can't use the typed atomic op in the FS because that has the execution
3600  * mask ANDed with the pixel mask, but we just want to write the one dword for
3601  * all the pixels.
3602  *
3603  * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3604  * one u32.  So we use the same untyped atomic write message as the pixel
3605  * shader.
3606  *
3607  * The untyped atomic operation requires a BUFFER surface type with RAW
3608  * format, and is only accessible through the legacy DATA_CACHE dataport
3609  * messages.
3610  */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(devinfo->ver >= 7);

   const unsigned sfid = devinfo->verx10 >= 75 ?
                         HSW_SFID_DATAPORT_DATA_CACHE_1 :
                         GFX7_SFID_DATAPORT_DATA_CACHE;

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file, payload.nr, 0));

   /* Untyped atomic ADD, two registers of payload, no response. */
   brw_set_desc(p, send, brw_message_desc(devinfo, 2, 0, false) |
                         brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
                                                    false));

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_binding_table_index(devinfo, send, surf_index);

   brw_pop_insn_state(p);
}
3643 
3644 
3645 /**
3646  * Emit the SEND message for a barrier
3647  */
3648 void
brw_barrier(struct brw_codegen * p,struct brw_reg src)3649 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3650 {
3651    const struct intel_device_info *devinfo = p->devinfo;
3652    struct brw_inst *inst;
3653 
3654    assert(devinfo->ver >= 7);
3655 
3656    brw_push_insn_state(p);
3657    brw_set_default_access_mode(p, BRW_ALIGN_1);
3658    inst = next_insn(p, BRW_OPCODE_SEND);
3659    brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3660    brw_set_src0(p, inst, src);
3661    brw_set_src1(p, inst, brw_null_reg());
3662    brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));
3663 
3664    brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
3665    brw_inst_set_gateway_subfuncid(devinfo, inst,
3666                                   BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3667 
3668    brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3669    brw_pop_insn_state(p);
3670 }
3671 
3672 
3673 /**
3674  * Emit the wait instruction for a barrier
3675  */
3676 void
brw_WAIT(struct brw_codegen * p)3677 brw_WAIT(struct brw_codegen *p)
3678 {
3679    const struct intel_device_info *devinfo = p->devinfo;
3680    struct brw_inst *insn;
3681 
3682    struct brw_reg src = brw_notification_reg();
3683 
3684    insn = next_insn(p, BRW_OPCODE_WAIT);
3685    brw_set_dest(p, insn, src);
3686    brw_set_src0(p, insn, src);
3687    brw_set_src1(p, insn, brw_null_reg());
3688 
3689    brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3690    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3691 }
3692 
3693 void
brw_float_controls_mode(struct brw_codegen * p,unsigned mode,unsigned mask)3694 brw_float_controls_mode(struct brw_codegen *p,
3695                         unsigned mode, unsigned mask)
3696 {
3697    /* From the Skylake PRM, Volume 7, page 760:
3698     *  "Implementation Restriction on Register Access: When the control
3699     *   register is used as an explicit source and/or destination, hardware
3700     *   does not ensure execution pipeline coherency. Software must set the
3701     *   thread control field to ‘switch’ for an instruction that uses
3702     *   control register as an explicit operand."
3703     *
3704     * On Gfx12+ this is implemented in terms of SWSB annotations instead.
3705     */
3706    brw_set_default_swsb(p, tgl_swsb_regdist(1));
3707 
3708    brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3709                             brw_imm_ud(~mask));
3710    brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3711    if (p->devinfo->ver < 12)
3712       brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3713 
3714    if (mode) {
3715       brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3716                                  brw_imm_ud(mode));
3717       brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
3718       if (p->devinfo->ver < 12)
3719          brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
3720    }
3721 
3722    if (p->devinfo->ver >= 12)
3723       brw_SYNC(p, TGL_SYNC_NOP);
3724 }
3725 
3726 void
brw_update_reloc_imm(const struct intel_device_info * devinfo,brw_inst * inst,uint32_t value)3727 brw_update_reloc_imm(const struct intel_device_info *devinfo,
3728                      brw_inst *inst,
3729                      uint32_t value)
3730 {
3731    /* Sanity check that the instruction is a MOV of an immediate */
3732    assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV);
3733    assert(brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE);
3734 
3735    /* If it was compacted, we can't safely rewrite */
3736    assert(brw_inst_cmpt_control(devinfo, inst) == 0);
3737 
3738    brw_inst_set_imm_ud(devinfo, inst, value);
3739 }
3740 
3741 /* A default value for constants that will be patched at run-time.
3742  * We pick an arbitrary value that prevents instruction compaction.
3743  */
3744 #define DEFAULT_PATCH_IMM 0x4a7cc037
3745 
3746 void
brw_MOV_reloc_imm(struct brw_codegen * p,struct brw_reg dst,enum brw_reg_type src_type,uint32_t id)3747 brw_MOV_reloc_imm(struct brw_codegen *p,
3748                   struct brw_reg dst,
3749                   enum brw_reg_type src_type,
3750                   uint32_t id)
3751 {
3752    assert(type_sz(src_type) == 4);
3753    assert(type_sz(dst.type) == 4);
3754 
3755    brw_add_reloc(p, id, BRW_SHADER_RELOC_TYPE_MOV_IMM,
3756                  p->next_insn_offset, 0);
3757 
3758    brw_MOV(p, dst, retype(brw_imm_ud(DEFAULT_PATCH_IMM), src_type));
3759 }
3760