1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file brw_fs_combine_constants.cpp
25  *
26  * This file contains the opt_combine_constants() pass that runs after the
27  * regular optimization loop. It passes over the instruction list and
28  * selectively promotes immediate values to registers by emitting a mov(1)
29  * instruction.
30  *
31  * This is useful on Gen 7 particularly, because a few instructions can be
32  * coissued (i.e., issued in the same cycle as another thread on the same EU
33  * issues an instruction) under some circumstances, one of which is that they
34  * cannot use immediate values.
35  */
36 
37 #include "brw_fs.h"
38 #include "brw_cfg.h"
39 #include "util/half_float.h"
40 
41 using namespace brw;
42 
/* Compile-time switch: when true, opt_combine_constants() prints one summary
 * line per promoted constant with printf() after the sources were rewritten.
 */
static const bool debug = false;
44 
45 /* Returns whether an instruction could co-issue if its immediate source were
46  * replaced with a GRF source.
47  */
48 static bool
could_coissue(const struct intel_device_info * devinfo,const fs_inst * inst)49 could_coissue(const struct intel_device_info *devinfo, const fs_inst *inst)
50 {
51    if (devinfo->ver != 7)
52       return false;
53 
54    switch (inst->opcode) {
55    case BRW_OPCODE_MOV:
56    case BRW_OPCODE_CMP:
57    case BRW_OPCODE_ADD:
58    case BRW_OPCODE_MUL:
59       /* Only float instructions can coissue.  We don't have a great
60        * understanding of whether or not something like float(int(a) + int(b))
61        * would be considered float (based on the destination type) or integer
62        * (based on the source types), so we take the conservative choice of
63        * only promoting when both destination and source are float.
64        */
65       return inst->dst.type == BRW_REGISTER_TYPE_F &&
66              inst->src[0].type == BRW_REGISTER_TYPE_F;
67    default:
68       return false;
69    }
70 }
71 
72 /**
73  * Returns true for instructions that don't support immediate sources.
74  */
75 static bool
must_promote_imm(const struct intel_device_info * devinfo,const fs_inst * inst)76 must_promote_imm(const struct intel_device_info *devinfo, const fs_inst *inst)
77 {
78    switch (inst->opcode) {
79    case SHADER_OPCODE_POW:
80       return devinfo->ver < 8;
81    case BRW_OPCODE_MAD:
82    case BRW_OPCODE_ADD3:
83    case BRW_OPCODE_LRP:
84       return true;
85    default:
86       return false;
87    }
88 }
89 
/**
 * A box for putting fs_regs in a linked list.
 *
 * The box only stores the pointer it was constructed with; nothing in this
 * struct copies or releases the fs_reg it points at.
 */
struct reg_link {
   /* Presumably supplies the `new(mem_ctx)` form that link() uses to allocate
    * boxes from a ralloc context -- confirm against the macro definition.
    */
   DECLARE_RALLOC_CXX_OPERATORS(reg_link)

   reg_link(fs_reg *reg) : reg(reg) {}

   /** List node through which the box is chained into an exec_list. */
   struct exec_node link;
   /** The register operand this box refers to. */
   fs_reg *reg;
};
99 
100 static struct exec_node *
link(void * mem_ctx,fs_reg * reg)101 link(void *mem_ctx, fs_reg *reg)
102 {
103    reg_link *l = new(mem_ctx) reg_link(reg);
104    return &l->link;
105 }
106 
/**
 * Information about an immediate value.
 *
 * One entry exists per distinct (size, bit pattern) pair found by
 * opt_combine_constants(); see find_imm() for the matching rule.
 */
struct imm {
   /** The common ancestor of all blocks using this immediate value. */
   bblock_t *block;

   /**
    * The first instruction that was found using the immediate value.  It is
    * reset to NULL as soon as a later use moves \c block to a different
    * common ancestor; while non-NULL, the load of the constant is inserted
    * right before this instruction.
    */
   fs_inst *inst;

   /**
    * A list of fs_regs that refer to this immediate.  If we promote it, we'll
    * have to patch these up to refer to the new GRF.
    */
   exec_list *uses;

   /**
    * The immediate value.  Only the first \c size bytes are written when the
    * entry is created; the named members are views of those same bytes.
    */
   union {
      char bytes[8];
      double df;
      int64_t d64;
      float f;
      int32_t d;
      int16_t w;
   };
   /** Number of valid bytes in the value above (type_sz() of the source). */
   uint8_t size;

   /**
    * When promoting half-float we need to account for certain restrictions.
    * Set as soon as any use of the value has type HF.
    */
   bool is_half_float;

   /**
    * The GRF register and subregister number where we've decided to store the
    * constant value.  \c subreg_offset is a byte offset within register \c nr.
    */
   uint8_t subreg_offset;
   uint16_t nr;

   /** The number of coissuable instructions using this immediate. */
   uint16_t uses_by_coissue;

   /**
    * Whether this constant is used by an instruction that can't handle an
    * immediate source (and already has to be promoted to a GRF).
    */
   bool must_promote;

   /**
    * Index of the first and of the most recently recorded instruction using
    * the value, counted over all instructions of the program.
    *
    * NOTE(review): the counter in opt_combine_constants() is an `unsigned`,
    * so these 16-bit fields wrap for programs longer than 65535
    * instructions.  They only feed the sort order and the debug output.
    */
   uint16_t first_use_ip;
   uint16_t last_use_ip;
};
159 
/**
 * The working set of information about immediates.
 *
 * A growable array: new_imm() doubles \c size and reallocates \c imm whenever
 * \c len has caught up with it.
 */
struct table {
   /** Storage for the entries; only the first \c len are in use. */
   struct imm *imm;
   /** Number of entries \c imm has room for. */
   int size;
   /** Number of entries currently in use. */
   int len;
};
166 
167 static struct imm *
find_imm(struct table * table,void * data,uint8_t size)168 find_imm(struct table *table, void *data, uint8_t size)
169 {
170    for (int i = 0; i < table->len; i++) {
171       if (table->imm[i].size == size &&
172           !memcmp(table->imm[i].bytes, data, size)) {
173          return &table->imm[i];
174       }
175    }
176    return NULL;
177 }
178 
179 static struct imm *
new_imm(struct table * table,void * mem_ctx)180 new_imm(struct table *table, void *mem_ctx)
181 {
182    if (table->len == table->size) {
183       table->size *= 2;
184       table->imm = reralloc(mem_ctx, table->imm, struct imm, table->size);
185    }
186    return &table->imm[table->len++];
187 }
188 
189 /**
190  * Comparator used for sorting an array of imm structures.
191  *
192  * We sort by basic block number, then last use IP, then first use IP (least
193  * to greatest). This sorting causes immediates live in the same area to be
194  * allocated to the same register in the hopes that all values will be dead
195  * about the same time and the register can be reused.
196  */
197 static int
compare(const void * _a,const void * _b)198 compare(const void *_a, const void *_b)
199 {
200    const struct imm *a = (const struct imm *)_a,
201                     *b = (const struct imm *)_b;
202 
203    int block_diff = a->block->num - b->block->num;
204    if (block_diff)
205       return block_diff;
206 
207    int end_diff = a->last_use_ip - b->last_use_ip;
208    if (end_diff)
209       return end_diff;
210 
211    return a->first_use_ip - b->first_use_ip;
212 }
213 
214 static bool
get_constant_value(const struct intel_device_info * devinfo,const fs_inst * inst,uint32_t src_idx,void * out,brw_reg_type * out_type)215 get_constant_value(const struct intel_device_info *devinfo,
216                    const fs_inst *inst, uint32_t src_idx,
217                    void *out, brw_reg_type *out_type)
218 {
219    const bool can_do_source_mods = inst->can_do_source_mods(devinfo);
220    const fs_reg *src = &inst->src[src_idx];
221 
222    *out_type = src->type;
223 
224    switch (*out_type) {
225    case BRW_REGISTER_TYPE_DF: {
226       double val = !can_do_source_mods ? src->df : fabs(src->df);
227       memcpy(out, &val, 8);
228       break;
229    }
230    case BRW_REGISTER_TYPE_F: {
231       float val = !can_do_source_mods ? src->f : fabsf(src->f);
232       memcpy(out, &val, 4);
233       break;
234    }
235    case BRW_REGISTER_TYPE_HF: {
236       uint16_t val = src->d & 0xffffu;
237       if (can_do_source_mods)
238          val = _mesa_float_to_half(fabsf(_mesa_half_to_float(val)));
239       memcpy(out, &val, 2);
240       break;
241    }
242    case BRW_REGISTER_TYPE_Q: {
243       int64_t val = !can_do_source_mods ? src->d64 : llabs(src->d64);
244       memcpy(out, &val, 8);
245       break;
246    }
247    case BRW_REGISTER_TYPE_UQ:
248       memcpy(out, &src->u64, 8);
249       break;
250    case BRW_REGISTER_TYPE_D: {
251       int32_t val = !can_do_source_mods ? src->d : abs(src->d);
252       memcpy(out, &val, 4);
253       break;
254    }
255    case BRW_REGISTER_TYPE_UD:
256       memcpy(out, &src->ud, 4);
257       break;
258    case BRW_REGISTER_TYPE_W: {
259       int16_t val = src->d & 0xffffu;
260       if (can_do_source_mods)
261          val = abs(val);
262       memcpy(out, &val, 2);
263       break;
264    }
265    case BRW_REGISTER_TYPE_UW:
266       memcpy(out, &src->ud, 2);
267       break;
268    default:
269       return false;
270    };
271 
272    return true;
273 }
274 
275 static struct brw_reg
build_imm_reg_for_copy(struct imm * imm)276 build_imm_reg_for_copy(struct imm *imm)
277 {
278    switch (imm->size) {
279    case 8:
280       return brw_imm_d(imm->d64);
281    case 4:
282       return brw_imm_d(imm->d);
283    case 2:
284       return brw_imm_w(imm->w);
285    default:
286       unreachable("not implemented");
287    }
288 }
289 
290 static inline uint32_t
get_alignment_for_imm(const struct imm * imm)291 get_alignment_for_imm(const struct imm *imm)
292 {
293    if (imm->is_half_float)
294       return 4; /* At least MAD seems to require this */
295    else
296       return imm->size;
297 }
298 
299 static bool
needs_negate(const fs_reg * reg,const struct imm * imm)300 needs_negate(const fs_reg *reg, const struct imm *imm)
301 {
302    switch (reg->type) {
303    case BRW_REGISTER_TYPE_DF:
304       return signbit(reg->df) != signbit(imm->df);
305    case BRW_REGISTER_TYPE_F:
306       return signbit(reg->f) != signbit(imm->f);
307    case BRW_REGISTER_TYPE_Q:
308       return (reg->d64 < 0) != (imm->d64 < 0);
309    case BRW_REGISTER_TYPE_D:
310       return (reg->d < 0) != (imm->d < 0);
311    case BRW_REGISTER_TYPE_HF:
312       return (reg->d & 0x8000u) != (imm->w & 0x8000u);
313    case BRW_REGISTER_TYPE_W:
314       return ((int16_t)reg->d < 0) != (imm->w < 0);
315    case BRW_REGISTER_TYPE_UQ:
316    case BRW_REGISTER_TYPE_UD:
317    case BRW_REGISTER_TYPE_UW:
318       return false;
319    default:
320       unreachable("not implemented");
321    };
322 }
323 
324 static bool
representable_as_hf(float f,uint16_t * hf)325 representable_as_hf(float f, uint16_t *hf)
326 {
327    union fi u;
328    uint16_t h = _mesa_float_to_half(f);
329    u.f = _mesa_half_to_float(h);
330 
331    if (u.f == f) {
332       *hf = h;
333       return true;
334    }
335 
336    return false;
337 }
338 
/**
 * Checks whether \p d fits a signed 16-bit word, i.e. lies within
 * [-32768, 32767].
 *
 * \param w  Receives the value on success; left untouched otherwise.
 */
static bool
representable_as_w(int d, int16_t *w)
{
   const bool in_range = d >= -0x8000 && d <= 0x7fff;

   if (!in_range)
      return false;

   *w = (int16_t)d;
   return true;
}
350 
/**
 * Checks whether \p ud fits an unsigned 16-bit word, i.e. has none of its
 * bits above bit 15 set.
 *
 * \param uw  Receives the value on success; left untouched otherwise.
 */
static bool
representable_as_uw(unsigned ud, uint16_t *uw)
{
   if (ud > 0xffffu)
      return false;

   *uw = (uint16_t)ud;
   return true;
}
361 
362 static bool
supports_src_as_imm(const struct intel_device_info * devinfo,enum opcode op)363 supports_src_as_imm(const struct intel_device_info *devinfo, enum opcode op)
364 {
365    switch (op) {
366    case BRW_OPCODE_ADD3:
367       return devinfo->verx10 >= 125;
368    case BRW_OPCODE_MAD:
369       return devinfo->ver == 12 && devinfo->verx10 < 125;
370    default:
371       return false;
372    }
373 }
374 
375 static bool
can_promote_src_as_imm(const struct intel_device_info * devinfo,fs_inst * inst,unsigned src_idx)376 can_promote_src_as_imm(const struct intel_device_info *devinfo, fs_inst *inst,
377                        unsigned src_idx)
378 {
379    bool can_promote = false;
380 
381    /* Experiment shows that we can only support src0 as immediate */
382    if (src_idx != 0)
383       return false;
384 
385    if (!supports_src_as_imm(devinfo, inst->opcode))
386       return false;
387 
388    /* TODO - Fix the codepath below to use a bfloat16 immediate on XeHP,
389     *        since HF/F mixed mode has been removed from the hardware.
390     */
391    switch (inst->src[src_idx].type) {
392    case BRW_REGISTER_TYPE_F: {
393       uint16_t hf;
394       if (representable_as_hf(inst->src[src_idx].f, &hf)) {
395          inst->src[src_idx] = retype(brw_imm_uw(hf), BRW_REGISTER_TYPE_HF);
396          can_promote = true;
397       }
398       break;
399    }
400    case BRW_REGISTER_TYPE_W: {
401       int16_t w;
402       if (representable_as_w(inst->src[src_idx].d, &w)) {
403          inst->src[src_idx] = brw_imm_w(w);
404          can_promote = true;
405       }
406       break;
407    }
408    case BRW_REGISTER_TYPE_UW: {
409       uint16_t uw;
410       if (representable_as_uw(inst->src[src_idx].ud, &uw)) {
411          inst->src[src_idx] = brw_imm_uw(uw);
412          can_promote = true;
413       }
414       break;
415    }
416    default:
417       break;
418    }
419 
420    return can_promote;
421 }
422 
423 bool
opt_combine_constants()424 fs_visitor::opt_combine_constants()
425 {
426    void *const_ctx = ralloc_context(NULL);
427 
428    struct table table;
429    table.size = 8;
430    table.len = 0;
431    table.imm = ralloc_array(const_ctx, struct imm, table.size);
432 
433    const brw::idom_tree &idom = idom_analysis.require();
434    unsigned ip = -1;
435 
436    /* Make a pass through all instructions and count the number of times each
437     * constant is used by coissueable instructions or instructions that cannot
438     * take immediate arguments.
439     */
440    foreach_block_and_inst(block, fs_inst, inst, cfg) {
441       ip++;
442 
443       if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst))
444          continue;
445 
446       for (int i = 0; i < inst->sources; i++) {
447          if (inst->src[i].file != IMM)
448             continue;
449 
450          if (can_promote_src_as_imm(devinfo, inst, i))
451             continue;
452 
453          char data[8];
454          brw_reg_type type;
455          if (!get_constant_value(devinfo, inst, i, data, &type))
456             continue;
457 
458          uint8_t size = type_sz(type);
459 
460          struct imm *imm = find_imm(&table, data, size);
461 
462          if (imm) {
463             bblock_t *intersection = idom.intersect(block, imm->block);
464             if (intersection != imm->block)
465                imm->inst = NULL;
466             imm->block = intersection;
467             imm->uses->push_tail(link(const_ctx, &inst->src[i]));
468             imm->uses_by_coissue += could_coissue(devinfo, inst);
469             imm->must_promote = imm->must_promote || must_promote_imm(devinfo, inst);
470             imm->last_use_ip = ip;
471             if (type == BRW_REGISTER_TYPE_HF)
472                imm->is_half_float = true;
473          } else {
474             imm = new_imm(&table, const_ctx);
475             imm->block = block;
476             imm->inst = inst;
477             imm->uses = new(const_ctx) exec_list();
478             imm->uses->push_tail(link(const_ctx, &inst->src[i]));
479             memcpy(imm->bytes, data, size);
480             imm->size = size;
481             imm->is_half_float = type == BRW_REGISTER_TYPE_HF;
482             imm->uses_by_coissue = could_coissue(devinfo, inst);
483             imm->must_promote = must_promote_imm(devinfo, inst);
484             imm->first_use_ip = ip;
485             imm->last_use_ip = ip;
486          }
487       }
488    }
489 
490    /* Remove constants from the table that don't have enough uses to make them
491     * profitable to store in a register.
492     */
493    for (int i = 0; i < table.len;) {
494       struct imm *imm = &table.imm[i];
495 
496       if (!imm->must_promote && imm->uses_by_coissue < 4) {
497          table.imm[i] = table.imm[table.len - 1];
498          table.len--;
499          continue;
500       }
501       i++;
502    }
503    if (table.len == 0) {
504       ralloc_free(const_ctx);
505       return false;
506    }
507    if (cfg->num_blocks != 1)
508       qsort(table.imm, table.len, sizeof(struct imm), compare);
509 
510    /* Insert MOVs to load the constant values into GRFs. */
511    fs_reg reg(VGRF, alloc.allocate(1));
512    reg.stride = 0;
513    for (int i = 0; i < table.len; i++) {
514       struct imm *imm = &table.imm[i];
515       /* Insert it either before the instruction that generated the immediate
516        * or after the last non-control flow instruction of the common ancestor.
517        */
518       exec_node *n = (imm->inst ? imm->inst :
519                       imm->block->last_non_control_flow_inst()->next);
520 
521       /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
522        *
523        *   "In Align16 mode, the channel selects and channel enables apply to a
524        *    pair of half-floats, because these parameters are defined for DWord
525        *    elements ONLY. This is applicable when both source and destination
526        *    are half-floats."
527        *
528        * This means that Align16 instructions that use promoted HF immediates
529        * and use a <0,1,0>:HF region would read 2 HF slots instead of
530        * replicating the single one we want. To avoid this, we always populate
531        * both HF slots within a DWord with the constant.
532        */
533       const uint32_t width = devinfo->ver == 8 && imm->is_half_float ? 2 : 1;
534       const fs_builder ibld = bld.at(imm->block, n).exec_all().group(width, 0);
535 
536       /* Put the immediate in an offset aligned to its size. Some instructions
537        * seem to have additional alignment requirements, so account for that
538        * too.
539        */
540       reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));
541 
542       /* Ensure we have enough space in the register to copy the immediate */
543       struct brw_reg imm_reg = build_imm_reg_for_copy(imm);
544       if (reg.offset + type_sz(imm_reg.type) * width > REG_SIZE) {
545          reg.nr = alloc.allocate(1);
546          reg.offset = 0;
547       }
548 
549       ibld.MOV(retype(reg, imm_reg.type), imm_reg);
550       imm->nr = reg.nr;
551       imm->subreg_offset = reg.offset;
552 
553       reg.offset += imm->size * width;
554    }
555    shader_stats.promoted_constants = table.len;
556 
557    /* Rewrite the immediate sources to refer to the new GRFs. */
558    for (int i = 0; i < table.len; i++) {
559       foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
560          fs_reg *reg = link->reg;
561 #ifdef DEBUG
562          switch (reg->type) {
563          case BRW_REGISTER_TYPE_DF:
564             assert((isnan(reg->df) && isnan(table.imm[i].df)) ||
565                    (fabs(reg->df) == fabs(table.imm[i].df)));
566             break;
567          case BRW_REGISTER_TYPE_F:
568             assert((isnan(reg->f) && isnan(table.imm[i].f)) ||
569                    (fabsf(reg->f) == fabsf(table.imm[i].f)));
570             break;
571          case BRW_REGISTER_TYPE_HF:
572             assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) &&
573                     isnan(_mesa_half_to_float(table.imm[i].w))) ||
574                    (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) ==
575                     fabsf(_mesa_half_to_float(table.imm[i].w))));
576             break;
577          case BRW_REGISTER_TYPE_Q:
578             assert(abs(reg->d64) == abs(table.imm[i].d64));
579             break;
580          case BRW_REGISTER_TYPE_UQ:
581             assert(reg->d64 == table.imm[i].d64);
582             break;
583          case BRW_REGISTER_TYPE_D:
584             assert(abs(reg->d) == abs(table.imm[i].d));
585             break;
586          case BRW_REGISTER_TYPE_UD:
587             assert(reg->d == table.imm[i].d);
588             break;
589          case BRW_REGISTER_TYPE_W:
590             assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w);
591             break;
592          case BRW_REGISTER_TYPE_UW:
593             assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w);
594             break;
595          default:
596             break;
597          }
598 #endif
599 
600          reg->file = VGRF;
601          reg->offset = table.imm[i].subreg_offset;
602          reg->stride = 0;
603          reg->negate = needs_negate(reg, &table.imm[i]);
604          reg->nr = table.imm[i].nr;
605       }
606    }
607 
608    if (debug) {
609       for (int i = 0; i < table.len; i++) {
610          struct imm *imm = &table.imm[i];
611 
612          printf("0x%016" PRIx64 " - block %3d, reg %3d sub %2d, "
613                 "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n",
614                 (uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)),
615                 imm->block->num,
616                 imm->nr,
617                 imm->subreg_offset,
618                 imm->must_promote,
619                 imm->uses_by_coissue,
620                 imm->first_use_ip,
621                 imm->last_use_ip,
622                 imm->last_use_ip - imm->first_use_ip);
623       }
624    }
625 
626    ralloc_free(const_ctx);
627    invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
628 
629    return true;
630 }
631