1 /*
2  * Copyright © 2020 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include "aco_ir.h"
26 
27 #include "util/crc32.h"
28 
29 #include <algorithm>
30 #include <deque>
31 #include <set>
32 #include <vector>
33 
34 namespace aco {
35 
36 /* sgpr_presched/vgpr_presched */
37 void
collect_presched_stats(Program * program)38 collect_presched_stats(Program* program)
39 {
40    RegisterDemand presched_demand;
41    for (Block& block : program->blocks)
42       presched_demand.update(block.register_demand);
43    program->statistics[statistic_sgpr_presched] = presched_demand.sgpr;
44    program->statistics[statistic_vgpr_presched] = presched_demand.vgpr;
45 }
46 
47 class BlockCycleEstimator {
48 public:
49    enum resource {
50       null = 0,
51       scalar,
52       branch_sendmsg,
53       valu,
54       valu_complex,
55       lds,
56       export_gds,
57       vmem,
58       resource_count,
59    };
60 
BlockCycleEstimator(Program * program_)61    BlockCycleEstimator(Program* program_) : program(program_) {}
62 
63    Program* program;
64 
65    int32_t cur_cycle = 0;
66    int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0};
67    unsigned res_usage[(int)BlockCycleEstimator::resource_count] = {0};
68    int32_t reg_available[512] = {0};
69    std::deque<int32_t> lgkm;
70    std::deque<int32_t> exp;
71    std::deque<int32_t> vm;
72    std::deque<int32_t> vs;
73 
74    unsigned predict_cost(aco_ptr<Instruction>& instr);
75    void add(aco_ptr<Instruction>& instr);
76    void join(const BlockCycleEstimator& other);
77 
78 private:
79    unsigned get_waitcnt_cost(wait_imm imm);
80    unsigned get_dependency_cost(aco_ptr<Instruction>& instr);
81 
82    void use_resources(aco_ptr<Instruction>& instr);
83    int32_t cycles_until_res_available(aco_ptr<Instruction>& instr);
84 };
85 
86 struct wait_counter_info {
wait_counter_infoaco::wait_counter_info87    wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_)
88        : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
89    {}
90 
91    unsigned vm;
92    unsigned exp;
93    unsigned lgkm;
94    unsigned vs;
95 };
96 
97 struct perf_info {
98    int latency;
99 
100    BlockCycleEstimator::resource rsrc0;
101    unsigned cost0;
102 
103    BlockCycleEstimator::resource rsrc1;
104    unsigned cost1;
105 };
106 
107 static perf_info
get_perf_info(Program * program,aco_ptr<Instruction> & instr)108 get_perf_info(Program* program, aco_ptr<Instruction>& instr)
109 {
110    instr_class cls = instr_info.classes[(int)instr->opcode];
111 
112 #define WAIT(res)          BlockCycleEstimator::res, 0
113 #define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt
114 
115    if (program->chip_class >= GFX10) {
116       /* fp64 might be incorrect */
117       switch (cls) {
118       case instr_class::valu32:
119       case instr_class::valu_convert32:
120       case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)};
121       case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
122       case instr_class::valu_quarter_rate32:
123          return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)};
124       case instr_class::valu_transcendental32:
125          return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)};
126       case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
127       case instr_class::valu_double_add:
128          return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
129       case instr_class::valu_double_convert:
130          return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
131       case instr_class::valu_double_transcendental:
132          return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
133       case instr_class::salu: return {2, WAIT_USE(scalar, 1)};
134       case instr_class::smem: return {0, WAIT_USE(scalar, 1)};
135       case instr_class::branch:
136       case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)};
137       case instr_class::ds:
138          return instr->ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}
139                                 : perf_info{0, WAIT_USE(lds, 1)};
140       case instr_class::exp: return {0, WAIT_USE(export_gds, 1)};
141       case instr_class::vmem: return {0, WAIT_USE(vmem, 1)};
142       case instr_class::barrier:
143       case instr_class::waitcnt:
144       case instr_class::other:
145       default: return {0};
146       }
147    } else {
148       switch (cls) {
149       case instr_class::valu32: return {4, WAIT_USE(valu, 4)};
150       case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)};
151       case instr_class::valu64: return {8, WAIT_USE(valu, 8)};
152       case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)};
153       case instr_class::valu_fma:
154          return program->dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}
155                                             : perf_info{16, WAIT_USE(valu, 16)};
156       case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)};
157       case instr_class::valu_double: return {64, WAIT_USE(valu, 64)};
158       case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)};
159       case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)};
160       case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)};
161       case instr_class::salu: return {4, WAIT_USE(scalar, 4)};
162       case instr_class::smem: return {4, WAIT_USE(scalar, 4)};
163       case instr_class::branch:
164          return {8, WAIT_USE(branch_sendmsg, 8)};
165          return {4, WAIT_USE(branch_sendmsg, 4)};
166       case instr_class::ds:
167          return instr->ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)}
168                                 : perf_info{4, WAIT_USE(lds, 4)};
169       case instr_class::exp: return {16, WAIT_USE(export_gds, 16)};
170       case instr_class::vmem: return {4, WAIT_USE(vmem, 4)};
171       case instr_class::barrier:
172       case instr_class::waitcnt:
173       case instr_class::other:
174       default: return {4};
175       }
176    }
177 
178 #undef WAIT_USE
179 #undef WAIT
180 }
181 
182 void
use_resources(aco_ptr<Instruction> & instr)183 BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
184 {
185    perf_info perf = get_perf_info(program, instr);
186 
187    if (perf.rsrc0 != resource_count) {
188       res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0;
189       res_usage[(int)perf.rsrc0] += perf.cost0;
190    }
191 
192    if (perf.rsrc1 != resource_count) {
193       res_available[(int)perf.rsrc1] = cur_cycle + perf.cost1;
194       res_usage[(int)perf.rsrc1] += perf.cost1;
195    }
196 }
197 
198 int32_t
cycles_until_res_available(aco_ptr<Instruction> & instr)199 BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
200 {
201    perf_info perf = get_perf_info(program, instr);
202 
203    int32_t cost = 0;
204    if (perf.rsrc0 != resource_count)
205       cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle);
206    if (perf.rsrc1 != resource_count)
207       cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle);
208 
209    return cost;
210 }
211 
212 static wait_counter_info
get_wait_counter_info(aco_ptr<Instruction> & instr)213 get_wait_counter_info(aco_ptr<Instruction>& instr)
214 {
215    /* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance
216     * depends a lot on the situation. */
217 
218    if (instr->isEXP())
219       return wait_counter_info(0, 16, 0, 0);
220 
221    if (instr->isFlatLike()) {
222       unsigned lgkm = instr->isFlat() ? 20 : 0;
223       if (!instr->definitions.empty())
224          return wait_counter_info(230, 0, lgkm, 0);
225       else
226          return wait_counter_info(0, 0, lgkm, 230);
227    }
228 
229    if (instr->isSMEM()) {
230       if (instr->definitions.empty())
231          return wait_counter_info(0, 0, 200, 0);
232       if (instr->operands.empty()) /* s_memtime and s_memrealtime */
233          return wait_counter_info(0, 0, 1, 0);
234 
235       bool likely_desc_load = instr->operands[0].size() == 2;
236       bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
237       bool const_offset =
238          instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant());
239 
240       if (likely_desc_load || const_offset)
241          return wait_counter_info(0, 0, 30, 0); /* likely to hit L0 cache */
242 
243       return wait_counter_info(0, 0, 200, 0);
244    }
245 
246    if (instr->format == Format::DS)
247       return wait_counter_info(0, 0, 20, 0);
248 
249    if (instr->isVMEM() && !instr->definitions.empty())
250       return wait_counter_info(320, 0, 0, 0);
251 
252    if (instr->isVMEM() && instr->definitions.empty())
253       return wait_counter_info(0, 0, 0, 320);
254 
255    return wait_counter_info(0, 0, 0, 0);
256 }
257 
258 static wait_imm
get_wait_imm(Program * program,aco_ptr<Instruction> & instr)259 get_wait_imm(Program* program, aco_ptr<Instruction>& instr)
260 {
261    if (instr->opcode == aco_opcode::s_endpgm) {
262       return wait_imm(0, 0, 0, 0);
263    } else if (instr->opcode == aco_opcode::s_waitcnt) {
264       return wait_imm(GFX10_3, instr->sopp().imm);
265    } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
266       return wait_imm(0, 0, 0, instr->sopk().imm);
267    } else {
268       unsigned max_lgkm_cnt = program->chip_class >= GFX10 ? 62 : 14;
269       unsigned max_exp_cnt = 6;
270       unsigned max_vm_cnt = program->chip_class >= GFX9 ? 62 : 14;
271       unsigned max_vs_cnt = 62;
272 
273       wait_counter_info wait_info = get_wait_counter_info(instr);
274       wait_imm imm;
275       imm.lgkm = wait_info.lgkm ? max_lgkm_cnt : wait_imm::unset_counter;
276       imm.exp = wait_info.exp ? max_exp_cnt : wait_imm::unset_counter;
277       imm.vm = wait_info.vm ? max_vm_cnt : wait_imm::unset_counter;
278       imm.vs = wait_info.vs ? max_vs_cnt : wait_imm::unset_counter;
279       return imm;
280    }
281 }
282 
283 unsigned
get_dependency_cost(aco_ptr<Instruction> & instr)284 BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
285 {
286    int deps_available = cur_cycle;
287 
288    wait_imm imm = get_wait_imm(program, instr);
289    if (imm.vm != wait_imm::unset_counter) {
290       for (int i = 0; i < (int)vm.size() - imm.vm; i++)
291          deps_available = MAX2(deps_available, vm[i]);
292    }
293    if (imm.exp != wait_imm::unset_counter) {
294       for (int i = 0; i < (int)exp.size() - imm.exp; i++)
295          deps_available = MAX2(deps_available, exp[i]);
296    }
297    if (imm.lgkm != wait_imm::unset_counter) {
298       for (int i = 0; i < (int)lgkm.size() - imm.lgkm; i++)
299          deps_available = MAX2(deps_available, lgkm[i]);
300    }
301    if (imm.vs != wait_imm::unset_counter) {
302       for (int i = 0; i < (int)vs.size() - imm.vs; i++)
303          deps_available = MAX2(deps_available, vs[i]);
304    }
305 
306    if (instr->opcode == aco_opcode::s_endpgm) {
307       for (unsigned i = 0; i < 512; i++)
308          deps_available = MAX2(deps_available, reg_available[i]);
309    } else if (program->chip_class >= GFX10) {
310       for (Operand& op : instr->operands) {
311          if (op.isConstant() || op.isUndefined())
312             continue;
313          for (unsigned i = 0; i < op.size(); i++)
314             deps_available = MAX2(deps_available, reg_available[op.physReg().reg() + i]);
315       }
316    }
317 
318    if (program->chip_class < GFX10)
319       deps_available = align(deps_available, 4);
320 
321    return deps_available - cur_cycle;
322 }
323 
324 unsigned
predict_cost(aco_ptr<Instruction> & instr)325 BlockCycleEstimator::predict_cost(aco_ptr<Instruction>& instr)
326 {
327    int32_t dep = get_dependency_cost(instr);
328    return dep + std::max(cycles_until_res_available(instr) - dep, 0);
329 }
330 
331 static bool
is_vector(aco_opcode op)332 is_vector(aco_opcode op)
333 {
334    switch (instr_info.classes[(int)op]) {
335    case instr_class::valu32:
336    case instr_class::valu_convert32:
337    case instr_class::valu_fma:
338    case instr_class::valu_double:
339    case instr_class::valu_double_add:
340    case instr_class::valu_double_convert:
341    case instr_class::valu_double_transcendental:
342    case instr_class::vmem:
343    case instr_class::ds:
344    case instr_class::exp:
345    case instr_class::valu64:
346    case instr_class::valu_quarter_rate32:
347    case instr_class::valu_transcendental32: return true;
348    default: return false;
349    }
350 }
351 
352 void
add(aco_ptr<Instruction> & instr)353 BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
354 {
355    perf_info perf = get_perf_info(program, instr);
356 
357    cur_cycle += get_dependency_cost(instr);
358 
359    unsigned start;
360    bool dual_issue = program->chip_class >= GFX10 && program->wave_size == 64 &&
361                      is_vector(instr->opcode) && program->workgroup_size > 32;
362    for (unsigned i = 0; i < (dual_issue ? 2 : 1); i++) {
363       cur_cycle += cycles_until_res_available(instr);
364 
365       start = cur_cycle;
366       use_resources(instr);
367 
368       /* GCN is in-order and doesn't begin the next instruction until the current one finishes */
369       cur_cycle += program->chip_class >= GFX10 ? 1 : perf.latency;
370    }
371 
372    wait_imm imm = get_wait_imm(program, instr);
373    while (lgkm.size() > imm.lgkm)
374       lgkm.pop_front();
375    while (exp.size() > imm.exp)
376       exp.pop_front();
377    while (vm.size() > imm.vm)
378       vm.pop_front();
379    while (vs.size() > imm.vs)
380       vs.pop_front();
381 
382    wait_counter_info wait_info = get_wait_counter_info(instr);
383    if (wait_info.exp)
384       exp.push_back(cur_cycle + wait_info.exp);
385    if (wait_info.lgkm)
386       lgkm.push_back(cur_cycle + wait_info.lgkm);
387    if (wait_info.vm)
388       vm.push_back(cur_cycle + wait_info.vm);
389    if (wait_info.vs)
390       vs.push_back(cur_cycle + wait_info.vs);
391 
392    /* This is inaccurate but shouldn't affect anything after waitcnt insertion.
393     * Before waitcnt insertion, this is necessary to consider memory operations.
394     */
395    int latency = MAX3(wait_info.exp, wait_info.lgkm, wait_info.vm);
396    int32_t result_available = start + MAX2(perf.latency, latency);
397 
398    for (Definition& def : instr->definitions) {
399       int32_t* available = &reg_available[def.physReg().reg()];
400       for (unsigned i = 0; i < def.size(); i++)
401          available[i] = MAX2(available[i], result_available);
402    }
403 }
404 
405 static void
join_queue(std::deque<int32_t> & queue,const std::deque<int32_t> & pred,int cycle_diff)406 join_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pred, int cycle_diff)
407 {
408    for (unsigned i = 0; i < MIN2(queue.size(), pred.size()); i++)
409       queue.rbegin()[i] = MAX2(queue.rbegin()[i], pred.rbegin()[i] + cycle_diff);
410    for (int i = pred.size() - queue.size() - 1; i >= 0; i--)
411       queue.push_front(pred[i] + cycle_diff);
412 }
413 
414 void
join(const BlockCycleEstimator & pred)415 BlockCycleEstimator::join(const BlockCycleEstimator& pred)
416 {
417    assert(cur_cycle == 0);
418 
419    for (unsigned i = 0; i < (unsigned)resource_count; i++) {
420       assert(res_usage[i] == 0);
421       res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle);
422    }
423 
424    for (unsigned i = 0; i < 512; i++)
425       reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle);
426 
427    join_queue(lgkm, pred.lgkm, -pred.cur_cycle);
428    join_queue(exp, pred.exp, -pred.cur_cycle);
429    join_queue(vm, pred.vm, -pred.cur_cycle);
430    join_queue(vs, pred.vs, -pred.cur_cycle);
431 }
432 
433 /* instructions/branches/vmem_clauses/smem_clauses/cycles */
434 void
collect_preasm_stats(Program * program)435 collect_preasm_stats(Program* program)
436 {
437    for (Block& block : program->blocks) {
438       std::set<Instruction*> vmem_clause;
439       std::set<Instruction*> smem_clause;
440 
441       program->statistics[statistic_instructions] += block.instructions.size();
442 
443       for (aco_ptr<Instruction>& instr : block.instructions) {
444          if (instr->isSOPP() && instr->sopp().block != -1)
445             program->statistics[statistic_branches]++;
446 
447          if (instr->opcode == aco_opcode::p_constaddr)
448             program->statistics[statistic_instructions] += 2;
449 
450          if (instr->isVMEM() && !instr->operands.empty()) {
451             if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
452                              [&](Instruction* other)
453                              { return should_form_clause(instr.get(), other); }))
454                program->statistics[statistic_vmem_clauses]++;
455             vmem_clause.insert(instr.get());
456          } else {
457             vmem_clause.clear();
458          }
459 
460          if (instr->isSMEM() && !instr->operands.empty()) {
461             if (std::none_of(smem_clause.begin(), smem_clause.end(),
462                              [&](Instruction* other)
463                              { return should_form_clause(instr.get(), other); }))
464                program->statistics[statistic_smem_clauses]++;
465             smem_clause.insert(instr.get());
466          } else {
467             smem_clause.clear();
468          }
469       }
470    }
471 
472    double latency = 0;
473    double usage[(int)BlockCycleEstimator::resource_count] = {0};
474    std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program);
475 
476    if (program->stage.has(SWStage::VS) && program->info->vs.has_prolog) {
477       unsigned vs_input_latency = 320;
478       for (Definition def : program->vs_inputs) {
479          blocks[0].vm.push_back(vs_input_latency);
480          for (unsigned i = 0; i < def.size(); i++)
481             blocks[0].reg_available[def.physReg().reg() + i] = vs_input_latency;
482       }
483    }
484 
485    for (Block& block : program->blocks) {
486       BlockCycleEstimator& block_est = blocks[block.index];
487       for (unsigned pred : block.linear_preds)
488          block_est.join(blocks[pred]);
489 
490       for (aco_ptr<Instruction>& instr : block.instructions) {
491          unsigned before = block_est.cur_cycle;
492          block_est.add(instr);
493          instr->pass_flags = block_est.cur_cycle - before;
494       }
495 
496       /* TODO: it would be nice to be able to consider estimated loop trip
497        * counts used for loop unrolling.
498        */
499 
500       /* TODO: estimate the trip_count of divergent loops (those which break
501        * divergent) higher than of uniform loops
502        */
503 
504       /* Assume loops execute 8-2 times, uniform branches are taken 50% the time,
505        * and any lane in the wave takes a side of a divergent branch 75% of the
506        * time.
507        */
508       double iter = 1.0;
509       iter *= block.loop_nest_depth > 0 ? 8.0 : 1.0;
510       iter *= block.loop_nest_depth > 1 ? 4.0 : 1.0;
511       iter *= block.loop_nest_depth > 2 ? pow(2.0, block.loop_nest_depth - 2) : 1.0;
512       iter *= pow(0.5, block.uniform_if_depth);
513       iter *= pow(0.75, block.divergent_if_logical_depth);
514 
515       bool divergent_if_linear_else =
516          block.logical_preds.empty() && block.linear_preds.size() == 1 &&
517          block.linear_succs.size() == 1 &&
518          program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);
519       if (divergent_if_linear_else)
520          iter *= 0.25;
521 
522       latency += block_est.cur_cycle * iter;
523       for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++)
524          usage[i] += block_est.res_usage[i] * iter;
525    }
526 
527    /* This likely exaggerates the effectiveness of parallelism because it
528     * ignores instruction ordering. It can assume there might be SALU/VALU/etc
529     * work to from other waves while one is idle but that might not be the case
530     * because those other waves have not reached such a point yet.
531     */
532 
533    double parallelism = program->num_waves;
534    for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) {
535       if (usage[i] > 0.0)
536          parallelism = MIN2(parallelism, latency / usage[i]);
537    }
538    double waves_per_cycle = 1.0 / latency * parallelism;
539    double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0);
540 
541    double max_utilization = 1.0;
542    if (program->workgroup_size != UINT_MAX)
543       max_utilization =
544          program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
545    wave64_per_cycle *= max_utilization;
546 
547    program->statistics[statistic_latency] = round(latency);
548    program->statistics[statistic_inv_throughput] = round(1.0 / wave64_per_cycle);
549 
550    if (debug_flags & DEBUG_PERF_INFO) {
551       aco_print_program(program, stderr, print_no_ssa | print_perf_info);
552 
553       fprintf(stderr, "num_waves: %u\n", program->num_waves);
554       fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);
555       fprintf(stderr, "branch_sendmsg_usage: %f\n",
556               usage[(int)BlockCycleEstimator::branch_sendmsg]);
557       fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);
558       fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);
559       fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);
560       fprintf(stderr, "export_gds_usage: %f\n", usage[(int)BlockCycleEstimator::export_gds]);
561       fprintf(stderr, "vmem_usage: %f\n", usage[(int)BlockCycleEstimator::vmem]);
562       fprintf(stderr, "latency: %f\n", latency);
563       fprintf(stderr, "parallelism: %f\n", parallelism);
564       fprintf(stderr, "max_utilization: %f\n", max_utilization);
565       fprintf(stderr, "wave64_per_cycle: %f\n", wave64_per_cycle);
566       fprintf(stderr, "\n");
567    }
568 }
569 
570 void
collect_postasm_stats(Program * program,const std::vector<uint32_t> & code)571 collect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
572 {
573    program->statistics[aco::statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);
574 }
575 
576 } // namespace aco
577