1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file brw_fs_combine_constants.cpp
25  *
26  * This file contains the opt_combine_constants() pass that runs after the
27  * regular optimization loop. It passes over the instruction list and
28  * selectively promotes immediate values to registers by emitting a mov(1)
29  * instruction.
30  *
31  * This is useful on Gen 7 particularly, because a few instructions can be
32  * coissued (i.e., issued in the same cycle as another thread on the same EU
33  * issues an instruction) under some circumstances, one of which is that they
34  * cannot use immediate values.
35  */
36 
37 #include "brw_fs.h"
38 #include "brw_cfg.h"
39 #include "util/half_float.h"
40 
41 using namespace brw;
42 
43 static const bool debug = false;
44 
45 /* Returns whether an instruction could co-issue if its immediate source were
46  * replaced with a GRF source.
47  */
48 static bool
could_coissue(const struct gen_device_info * devinfo,const fs_inst * inst)49 could_coissue(const struct gen_device_info *devinfo, const fs_inst *inst)
50 {
51    if (devinfo->gen != 7)
52       return false;
53 
54    switch (inst->opcode) {
55    case BRW_OPCODE_MOV:
56    case BRW_OPCODE_CMP:
57    case BRW_OPCODE_ADD:
58    case BRW_OPCODE_MUL:
59       /* Only float instructions can coissue.  We don't have a great
60        * understanding of whether or not something like float(int(a) + int(b))
61        * would be considered float (based on the destination type) or integer
62        * (based on the source types), so we take the conservative choice of
63        * only promoting when both destination and source are float.
64        */
65       return inst->dst.type == BRW_REGISTER_TYPE_F &&
66              inst->src[0].type == BRW_REGISTER_TYPE_F;
67    default:
68       return false;
69    }
70 }
71 
72 /**
73  * Returns true for instructions that don't support immediate sources.
74  */
75 static bool
must_promote_imm(const struct gen_device_info * devinfo,const fs_inst * inst)76 must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst)
77 {
78    switch (inst->opcode) {
79    case SHADER_OPCODE_POW:
80       return devinfo->gen < 8;
81    case BRW_OPCODE_MAD:
82    case BRW_OPCODE_LRP:
83       return true;
84    default:
85       return false;
86    }
87 }
88 
/**
 * A box for putting fs_regs in a linked list.
 *
 * The box stores a pointer to the fs_reg (not a copy) next to the exec_node
 * that chains it into a list, so the register can later be patched in place.
 */
struct reg_link {
   /* Presumably supplies the new(mem_ctx) form used by link() -- confirm
    * against the ralloc header.
    */
   DECLARE_RALLOC_CXX_OPERATORS(reg_link)

   reg_link(fs_reg *reg) : reg(reg) {}

   /** List hook; link() hands out the address of this member. */
   struct exec_node link;
   /** The register this box refers to. */
   fs_reg *reg;
};
98 
99 static struct exec_node *
link(void * mem_ctx,fs_reg * reg)100 link(void *mem_ctx, fs_reg *reg)
101 {
102    reg_link *l = new(mem_ctx) reg_link(reg);
103    return &l->link;
104 }
105 
/**
 * Information about an immediate value.
 *
 * One entry exists per distinct (size, byte pattern) pair found by the
 * combine-constants pass; see find_imm().
 */
struct imm {
   /** The common ancestor of all blocks using this immediate value. */
   bblock_t *block;

   /**
    * The first instruction seen using the immediate value.  It is reset to
    * NULL once a later use moves the common ancestor block away from the
    * block recorded so far.
    */
   fs_inst *inst;

   /**
    * A list of fs_regs that refer to this immediate.  If we promote it, we'll
    * have to patch these up to refer to the new GRF.
    */
   exec_list *uses;

   /** The immediate value; only the first \c size bytes are meaningful. */
   union {
      char bytes[8];
      double df;
      int64_t d64;
      float f;
      int32_t d;
      int16_t w;
   };
   /** Size of the value in bytes, taken from the source's register type. */
   uint8_t size;

   /** When promoting half-float we need to account for certain restrictions */
   bool is_half_float;

   /**
    * The GRF register and subregister number where we've decided to store the
    * constant value.
    */
   uint8_t subreg_offset;
   uint16_t nr;

   /** The number of coissuable instructions using this immediate. */
   uint16_t uses_by_coissue;

   /**
    * Whether this constant is used by an instruction that can't handle an
    * immediate source (and already has to be promoted to a GRF).
    */
   bool must_promote;

   /** Instruction index of the first use found while scanning. */
   uint16_t first_use_ip;
   /** Instruction index of the most recent use found while scanning. */
   uint16_t last_use_ip;
};
158 
/** The working set of information about immediates. */
struct table {
   /** Array of entries; grown by new_imm() when it fills up. */
   struct imm *imm;
   /** Number of entries the array currently has room for. */
   int size;
   /** Number of entries currently in use. */
   int len;
};
165 
166 static struct imm *
find_imm(struct table * table,void * data,uint8_t size)167 find_imm(struct table *table, void *data, uint8_t size)
168 {
169    for (int i = 0; i < table->len; i++) {
170       if (table->imm[i].size == size &&
171           !memcmp(table->imm[i].bytes, data, size)) {
172          return &table->imm[i];
173       }
174    }
175    return NULL;
176 }
177 
178 static struct imm *
new_imm(struct table * table,void * mem_ctx)179 new_imm(struct table *table, void *mem_ctx)
180 {
181    if (table->len == table->size) {
182       table->size *= 2;
183       table->imm = reralloc(mem_ctx, table->imm, struct imm, table->size);
184    }
185    return &table->imm[table->len++];
186 }
187 
188 /**
189  * Comparator used for sorting an array of imm structures.
190  *
191  * We sort by basic block number, then last use IP, then first use IP (least
192  * to greatest). This sorting causes immediates live in the same area to be
193  * allocated to the same register in the hopes that all values will be dead
194  * about the same time and the register can be reused.
195  */
196 static int
compare(const void * _a,const void * _b)197 compare(const void *_a, const void *_b)
198 {
199    const struct imm *a = (const struct imm *)_a,
200                     *b = (const struct imm *)_b;
201 
202    int block_diff = a->block->num - b->block->num;
203    if (block_diff)
204       return block_diff;
205 
206    int end_diff = a->last_use_ip - b->last_use_ip;
207    if (end_diff)
208       return end_diff;
209 
210    return a->first_use_ip - b->first_use_ip;
211 }
212 
213 static bool
get_constant_value(const struct gen_device_info * devinfo,const fs_inst * inst,uint32_t src_idx,void * out,brw_reg_type * out_type)214 get_constant_value(const struct gen_device_info *devinfo,
215                    const fs_inst *inst, uint32_t src_idx,
216                    void *out, brw_reg_type *out_type)
217 {
218    const bool can_do_source_mods = inst->can_do_source_mods(devinfo);
219    const fs_reg *src = &inst->src[src_idx];
220 
221    *out_type = src->type;
222 
223    switch (*out_type) {
224    case BRW_REGISTER_TYPE_DF: {
225       double val = !can_do_source_mods ? src->df : fabs(src->df);
226       memcpy(out, &val, 8);
227       break;
228    }
229    case BRW_REGISTER_TYPE_F: {
230       float val = !can_do_source_mods ? src->f : fabsf(src->f);
231       memcpy(out, &val, 4);
232       break;
233    }
234    case BRW_REGISTER_TYPE_HF: {
235       uint16_t val = src->d & 0xffffu;
236       if (can_do_source_mods)
237          val = _mesa_float_to_half(fabsf(_mesa_half_to_float(val)));
238       memcpy(out, &val, 2);
239       break;
240    }
241    case BRW_REGISTER_TYPE_Q: {
242       int64_t val = !can_do_source_mods ? src->d64 : llabs(src->d64);
243       memcpy(out, &val, 8);
244       break;
245    }
246    case BRW_REGISTER_TYPE_UQ:
247       memcpy(out, &src->u64, 8);
248       break;
249    case BRW_REGISTER_TYPE_D: {
250       int32_t val = !can_do_source_mods ? src->d : abs(src->d);
251       memcpy(out, &val, 4);
252       break;
253    }
254    case BRW_REGISTER_TYPE_UD:
255       memcpy(out, &src->ud, 4);
256       break;
257    case BRW_REGISTER_TYPE_W: {
258       int16_t val = src->d & 0xffffu;
259       if (can_do_source_mods)
260          val = abs(val);
261       memcpy(out, &val, 2);
262       break;
263    }
264    case BRW_REGISTER_TYPE_UW:
265       memcpy(out, &src->ud, 2);
266       break;
267    default:
268       return false;
269    };
270 
271    return true;
272 }
273 
274 static struct brw_reg
build_imm_reg_for_copy(struct imm * imm)275 build_imm_reg_for_copy(struct imm *imm)
276 {
277    switch (imm->size) {
278    case 8:
279       return brw_imm_d(imm->d64);
280    case 4:
281       return brw_imm_d(imm->d);
282    case 2:
283       return brw_imm_w(imm->w);
284    default:
285       unreachable("not implemented");
286    }
287 }
288 
289 static inline uint32_t
get_alignment_for_imm(const struct imm * imm)290 get_alignment_for_imm(const struct imm *imm)
291 {
292    if (imm->is_half_float)
293       return 4; /* At least MAD seems to require this */
294    else
295       return imm->size;
296 }
297 
298 static bool
needs_negate(const fs_reg * reg,const struct imm * imm)299 needs_negate(const fs_reg *reg, const struct imm *imm)
300 {
301    switch (reg->type) {
302    case BRW_REGISTER_TYPE_DF:
303       return signbit(reg->df) != signbit(imm->df);
304    case BRW_REGISTER_TYPE_F:
305       return signbit(reg->f) != signbit(imm->f);
306    case BRW_REGISTER_TYPE_Q:
307       return (reg->d64 < 0) != (imm->d64 < 0);
308    case BRW_REGISTER_TYPE_D:
309       return (reg->d < 0) != (imm->d < 0);
310    case BRW_REGISTER_TYPE_HF:
311       return (reg->d & 0x8000u) != (imm->w & 0x8000u);
312    case BRW_REGISTER_TYPE_W:
313       return ((int16_t)reg->d < 0) != (imm->w < 0);
314    case BRW_REGISTER_TYPE_UQ:
315    case BRW_REGISTER_TYPE_UD:
316    case BRW_REGISTER_TYPE_UW:
317       return false;
318    default:
319       unreachable("not implemented");
320    };
321 }
322 
323 static bool
representable_as_hf(float f,uint16_t * hf)324 representable_as_hf(float f, uint16_t *hf)
325 {
326    union fi u;
327    uint16_t h = _mesa_float_to_half(f);
328    u.f = _mesa_half_to_float(h);
329 
330    if (u.f == f) {
331       *hf = h;
332       return true;
333    }
334 
335    return false;
336 }
337 
338 static bool
represent_src_as_imm(const struct gen_device_info * devinfo,fs_reg * src)339 represent_src_as_imm(const struct gen_device_info *devinfo,
340                      fs_reg *src)
341 {
342    /* TODO : consider specific platforms also */
343    if (devinfo->gen == 12) {
344       uint16_t hf;
345       if (representable_as_hf(src->f, &hf)) {
346          *src = retype(brw_imm_uw(hf), BRW_REGISTER_TYPE_HF);
347          return true;
348       }
349    }
350    return false;
351 }
352 
353 bool
opt_combine_constants()354 fs_visitor::opt_combine_constants()
355 {
356    void *const_ctx = ralloc_context(NULL);
357 
358    struct table table;
359    table.size = 8;
360    table.len = 0;
361    table.imm = ralloc_array(const_ctx, struct imm, table.size);
362 
363    const brw::idom_tree &idom = idom_analysis.require();
364    unsigned ip = -1;
365 
366    /* Make a pass through all instructions and count the number of times each
367     * constant is used by coissueable instructions or instructions that cannot
368     * take immediate arguments.
369     */
370    foreach_block_and_inst(block, fs_inst, inst, cfg) {
371       ip++;
372 
373       if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst))
374          continue;
375 
376       bool represented_as_imm = false;
377       for (int i = 0; i < inst->sources; i++) {
378          if (inst->src[i].file != IMM)
379             continue;
380 
381          if (!represented_as_imm && i == 0 &&
382              inst->opcode == BRW_OPCODE_MAD &&
383              represent_src_as_imm(devinfo, &inst->src[i])) {
384             represented_as_imm = true;
385             continue;
386          }
387 
388          char data[8];
389          brw_reg_type type;
390          if (!get_constant_value(devinfo, inst, i, data, &type))
391             continue;
392 
393          uint8_t size = type_sz(type);
394 
395          struct imm *imm = find_imm(&table, data, size);
396 
397          if (imm) {
398             bblock_t *intersection = idom.intersect(block, imm->block);
399             if (intersection != imm->block)
400                imm->inst = NULL;
401             imm->block = intersection;
402             imm->uses->push_tail(link(const_ctx, &inst->src[i]));
403             imm->uses_by_coissue += could_coissue(devinfo, inst);
404             imm->must_promote = imm->must_promote || must_promote_imm(devinfo, inst);
405             imm->last_use_ip = ip;
406             if (type == BRW_REGISTER_TYPE_HF)
407                imm->is_half_float = true;
408          } else {
409             imm = new_imm(&table, const_ctx);
410             imm->block = block;
411             imm->inst = inst;
412             imm->uses = new(const_ctx) exec_list();
413             imm->uses->push_tail(link(const_ctx, &inst->src[i]));
414             memcpy(imm->bytes, data, size);
415             imm->size = size;
416             imm->is_half_float = type == BRW_REGISTER_TYPE_HF;
417             imm->uses_by_coissue = could_coissue(devinfo, inst);
418             imm->must_promote = must_promote_imm(devinfo, inst);
419             imm->first_use_ip = ip;
420             imm->last_use_ip = ip;
421          }
422       }
423    }
424 
425    /* Remove constants from the table that don't have enough uses to make them
426     * profitable to store in a register.
427     */
428    for (int i = 0; i < table.len;) {
429       struct imm *imm = &table.imm[i];
430 
431       if (!imm->must_promote && imm->uses_by_coissue < 4) {
432          table.imm[i] = table.imm[table.len - 1];
433          table.len--;
434          continue;
435       }
436       i++;
437    }
438    if (table.len == 0) {
439       ralloc_free(const_ctx);
440       return false;
441    }
442    if (cfg->num_blocks != 1)
443       qsort(table.imm, table.len, sizeof(struct imm), compare);
444 
445    /* Insert MOVs to load the constant values into GRFs. */
446    fs_reg reg(VGRF, alloc.allocate(1));
447    reg.stride = 0;
448    for (int i = 0; i < table.len; i++) {
449       struct imm *imm = &table.imm[i];
450       /* Insert it either before the instruction that generated the immediate
451        * or after the last non-control flow instruction of the common ancestor.
452        */
453       exec_node *n = (imm->inst ? imm->inst :
454                       imm->block->last_non_control_flow_inst()->next);
455 
456       /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
457        *
458        *   "In Align16 mode, the channel selects and channel enables apply to a
459        *    pair of half-floats, because these parameters are defined for DWord
460        *    elements ONLY. This is applicable when both source and destination
461        *    are half-floats."
462        *
463        * This means that Align16 instructions that use promoted HF immediates
464        * and use a <0,1,0>:HF region would read 2 HF slots instead of
465        * replicating the single one we want. To avoid this, we always populate
466        * both HF slots within a DWord with the constant.
467        */
468       const uint32_t width = devinfo->gen == 8 && imm->is_half_float ? 2 : 1;
469       const fs_builder ibld = bld.at(imm->block, n).exec_all().group(width, 0);
470 
471       /* Put the immediate in an offset aligned to its size. Some instructions
472        * seem to have additional alignment requirements, so account for that
473        * too.
474        */
475       reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));
476 
477       /* Ensure we have enough space in the register to copy the immediate */
478       struct brw_reg imm_reg = build_imm_reg_for_copy(imm);
479       if (reg.offset + type_sz(imm_reg.type) * width > REG_SIZE) {
480          reg.nr = alloc.allocate(1);
481          reg.offset = 0;
482       }
483 
484       ibld.MOV(retype(reg, imm_reg.type), imm_reg);
485       imm->nr = reg.nr;
486       imm->subreg_offset = reg.offset;
487 
488       reg.offset += imm->size * width;
489    }
490    shader_stats.promoted_constants = table.len;
491 
492    /* Rewrite the immediate sources to refer to the new GRFs. */
493    for (int i = 0; i < table.len; i++) {
494       foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
495          fs_reg *reg = link->reg;
496 #ifdef DEBUG
497          switch (reg->type) {
498          case BRW_REGISTER_TYPE_DF:
499             assert((isnan(reg->df) && isnan(table.imm[i].df)) ||
500                    (fabs(reg->df) == fabs(table.imm[i].df)));
501             break;
502          case BRW_REGISTER_TYPE_F:
503             assert((isnan(reg->f) && isnan(table.imm[i].f)) ||
504                    (fabsf(reg->f) == fabsf(table.imm[i].f)));
505             break;
506          case BRW_REGISTER_TYPE_HF:
507             assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) &&
508                     isnan(_mesa_half_to_float(table.imm[i].w))) ||
509                    (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) ==
510                     fabsf(_mesa_half_to_float(table.imm[i].w))));
511             break;
512          case BRW_REGISTER_TYPE_Q:
513             assert(abs(reg->d64) == abs(table.imm[i].d64));
514             break;
515          case BRW_REGISTER_TYPE_UQ:
516             assert(reg->d64 == table.imm[i].d64);
517             break;
518          case BRW_REGISTER_TYPE_D:
519             assert(abs(reg->d) == abs(table.imm[i].d));
520             break;
521          case BRW_REGISTER_TYPE_UD:
522             assert(reg->d == table.imm[i].d);
523             break;
524          case BRW_REGISTER_TYPE_W:
525             assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w);
526             break;
527          case BRW_REGISTER_TYPE_UW:
528             assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w);
529             break;
530          default:
531             break;
532          }
533 #endif
534 
535          reg->file = VGRF;
536          reg->offset = table.imm[i].subreg_offset;
537          reg->stride = 0;
538          reg->negate = needs_negate(reg, &table.imm[i]);
539          reg->nr = table.imm[i].nr;
540       }
541    }
542 
543    if (debug) {
544       for (int i = 0; i < table.len; i++) {
545          struct imm *imm = &table.imm[i];
546 
547          printf("0x%016" PRIx64 " - block %3d, reg %3d sub %2d, "
548                 "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n",
549                 (uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)),
550                 imm->block->num,
551                 imm->nr,
552                 imm->subreg_offset,
553                 imm->must_promote,
554                 imm->uses_by_coissue,
555                 imm->first_use_ip,
556                 imm->last_use_ip,
557                 imm->last_use_ip - imm->first_use_ip);
558       }
559    }
560 
561    ralloc_free(const_ctx);
562    invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
563 
564    return true;
565 }
566