/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include "aco_builder.h"
#include <unordered_set>
#include <algorithm>

#include "vulkan/radv_shader.h" // for radv_nir_compiler_options
#include "amdgfxregs.h"

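/* These limits are heuristics: the windows and move budgets shrink as the
 * expected wave count grows, since with more waves in flight the hardware can
 * already hide memory latency and aggressive code motion buys less. */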
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so make clause creation less aggressive the fewer waves we have */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512

namespace aco {

enum MoveResult {
   move_success,
   move_fail_ssa,
   move_fail_rar,
   move_fail_pressure,
};

struct MoveState {
   RegisterDemand max_registers;

   Block *block;
   Instruction *current;
   RegisterDemand *register_demand;
   bool improved_rar;

   std::vector<bool> depends_on;
   /* Two are needed because, for downwards VMEM scheduling, one needs to
    * exclude the instructions in the clause, since new instructions in the
    * clause are not moved past any other instructions in the clause. */
   std::vector<bool> RAR_dependencies;
   std::vector<bool> RAR_dependencies_clause;

   int source_idx;
   int insert_idx, insert_idx_clause;
   RegisterDemand total_demand, total_demand_clause;

   /* for moving instructions before the current instruction to after it */
   void downwards_init(int current_idx, bool improved_rar, bool may_form_clauses);
   MoveResult downwards_move(bool clause);
   void downwards_skip();

   /* for moving instructions after the first use of the current instruction upwards */
   void upwards_init(int source_idx, bool improved_rar);
   bool upwards_check_deps();
   void upwards_set_insert_idx(int before);
   MoveResult upwards_move();
   void upwards_skip();

private:
   void downwards_advance_helper();
};

struct sched_ctx {
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
   MoveState mv;
};

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, it tries to move independent instructions from above and below
 * between the memory instruction and its first user.
 * The novelty is that this scheduler takes the current register pressure into account:
 * instructions will only be moved if the register pressure won't exceed a certain bound.
 */
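/* Illustrative example (made up, not taken from a real shader): given
 *
 *    s[0:1] = s_load_dwordx2 ...   ; long-latency memory load
 *    v1     = v_add_f32 ...        ; independent of the load
 *    v2     = v_mul_f32 s0, ...    ; first user of the load's result
 *
 * the pass tries to place the independent v_add_f32 (and similar later
 * instructions moved upwards) between the load and its first user, hiding
 * part of the load's latency without exceeding the register pressure bound. */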
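/* Moves the element at index idx so that it ends up directly before index
 * 'before' when moving down, or at index 'before' when moving up, shifting
 * the elements in between by one. For example, [a,b,c,d] with idx=0 and
 * before=3 becomes [b,c,a,d]. */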
template <typename T>
void move_element(T begin_it, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(begin_it, idx);
      auto end = std::next(begin_it, before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(begin_it, before);
      auto end = std::next(begin_it, idx + 1);
      std::rotate(begin, end - 1, end);
   }
}

void MoveState::downwards_advance_helper()
{
   source_idx--;
   total_demand.update(register_demand[source_idx]);
}

void MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
{
   improved_rar = improved_rar_;
   source_idx = current_idx;

   insert_idx = current_idx + 1;
   insert_idx_clause = current_idx;

   total_demand = total_demand_clause = register_demand[current_idx];

   std::fill(depends_on.begin(), depends_on.end(), false);
   if (improved_rar) {
      std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);
      if (may_form_clauses)
         std::fill(RAR_dependencies_clause.begin(), RAR_dependencies_clause.end(), false);
   }

   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill())
            RAR_dependencies[op.tempId()] = true;
      }
   }

   /* update total_demand/source_idx */
   downwards_advance_helper();
}

MoveResult MoveState::downwards_move(bool clause)
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Definition& def : instr->definitions)
      if (def.isTemp() && depends_on[def.tempId()])
         return move_fail_ssa;

   /* check if one of the candidate's operands is killed by a depending instruction */
   std::vector<bool>& RAR_deps = improved_rar ? (clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && RAR_deps[op.tempId()]) {
         // FIXME: account for difference in register pressure
         return move_fail_rar;
      }
   }

   if (clause) {
      for (const Operand& op : instr->operands) {
         if (op.isTemp()) {
            depends_on[op.tempId()] = true;
            if (op.isFirstKill())
               RAR_dependencies[op.tempId()] = true;
         }
      }
   }

   int dest_insert_idx = clause ? insert_idx_clause : insert_idx;
   RegisterDemand register_pressure = clause ? total_demand_clause : total_demand;

   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(register_pressure - candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[dest_insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[dest_insert_idx - 1] - temp2 + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate below the memory load */
   move_element(block->instructions.begin(), source_idx, dest_insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, dest_insert_idx);
   for (int i = source_idx; i < dest_insert_idx - 1; i++)
      register_demand[i] -= candidate_diff;
   register_demand[dest_insert_idx - 1] = new_demand;
   total_demand_clause -= candidate_diff;
   insert_idx_clause--;
   if (!clause) {
      total_demand -= candidate_diff;
      insert_idx--;
   }

   downwards_advance_helper();
   return move_success;
}

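/* Keep the candidate in place: temporaries it reads become dependencies
 * (instructions defining them may not be moved down past it), its killed
 * operands become RAR dependencies, and then we advance to the next
 * candidate above. */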
void MoveState::downwards_skip()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Operand& op : instr->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill()) {
            RAR_dependencies[op.tempId()] = true;
            RAR_dependencies_clause[op.tempId()] = true;
         }
      }
   }
   total_demand_clause.update(register_demand[source_idx]);

   downwards_advance_helper();
}

void MoveState::upwards_init(int source_idx_, bool improved_rar_)
{
   source_idx = source_idx_;
   improved_rar = improved_rar_;

   insert_idx = -1;

   std::fill(depends_on.begin(), depends_on.end(), false);
   std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);

   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         depends_on[def.tempId()] = true;
   }
}

bool MoveState::upwards_check_deps()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return false;
   }
   return true;
}

void MoveState::upwards_set_insert_idx(int before)
{
   insert_idx = before;
   total_demand = register_demand[before - 1];
}

MoveResult MoveState::upwards_move()
{
   assert(insert_idx >= 0);

   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return move_fail_ssa;
   }

   /* check if candidate uses/kills an operand which is used by a dependency */
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && (!improved_rar || op.isFirstKill()) && RAR_dependencies[op.tempId()])
         return move_fail_rar;
   }

   /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(total_demand + candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate above the insert_idx */
   move_element(block->instructions.begin(), source_idx, insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, insert_idx);
   for (int i = insert_idx + 1; i <= source_idx; i++)
      register_demand[i] += candidate_diff;
   register_demand[insert_idx] = new_demand;
   total_demand += candidate_diff;

   insert_idx++;

   total_demand.update(register_demand[source_idx]);
   source_idx++;

   return move_success;
}

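/* Keep the candidate in place: once an insertion point exists, its
 * definitions become dependencies and its operands become RAR dependencies,
 * so that later candidates are not incorrectly moved above it. */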
void MoveState::upwards_skip()
{
   if (insert_idx >= 0) {
      aco_ptr<Instruction>& instr = block->instructions[source_idx];
      for (const Definition& def : instr->definitions) {
         if (def.isTemp())
            depends_on[def.tempId()] = true;
      }
      for (const Operand& op : instr->operands) {
         if (op.isTemp())
            RAR_dependencies[op.tempId()] = true;
      }
      total_demand.update(register_demand[source_idx]);
   }

   source_idx++;
}

bool is_gs_or_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<const SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs ||
             (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

bool is_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<const SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

memory_sync_info get_sync_info_with_hack(const Instruction* instr)
{
   memory_sync_info sync = get_sync_info(instr);
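   /* An SMEM instruction whose first operand is 16 bytes is presumably a
    * buffer load through a 128-bit resource descriptor, so conservatively
    * treat it as a private buffer access to keep it ordered with other
    * buffer accesses. */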
   if (instr->format == Format::SMEM && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
      // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
      sync.storage = (storage_class)(sync.storage | storage_buffer);
      sync.semantics = (memory_semantics)(sync.semantics | semantic_private);
   }
   return sync;
}

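/* Summary of the memory events seen so far; all fields except
 * has_control_barrier are bitmasks of storage classes. */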
struct memory_event_set {
   bool has_control_barrier;

   unsigned bar_acquire;
   unsigned bar_release;
   unsigned bar_classes;

   unsigned access_acquire;
   unsigned access_release;
   unsigned access_relaxed;
   unsigned access_atomic;
};

struct hazard_query {
   bool contains_spill;
   bool contains_sendmsg;
   memory_event_set mem_events;
   unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
   unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
};

void init_hazard_query(hazard_query *query) {
   query->contains_spill = false;
   query->contains_sendmsg = false;
   memset(&query->mem_events, 0, sizeof(query->mem_events));
   query->aliasing_storage = 0;
   query->aliasing_storage_smem = 0;
}

void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync)
{
   set->has_control_barrier |= is_done_sendmsg(instr);
   if (instr->opcode == aco_opcode::p_barrier) {
      Pseudo_barrier_instruction *bar = static_cast<Pseudo_barrier_instruction*>(instr);
      if (bar->sync.semantics & semantic_acquire)
         set->bar_acquire |= bar->sync.storage;
      if (bar->sync.semantics & semantic_release)
         set->bar_release |= bar->sync.storage;
      set->bar_classes |= bar->sync.storage;

      set->has_control_barrier |= bar->exec_scope > scope_invocation;
   }

   if (!sync->storage)
      return;

   if (sync->semantics & semantic_acquire)
      set->access_acquire |= sync->storage;
   if (sync->semantics & semantic_release)
      set->access_release |= sync->storage;

   if (!(sync->semantics & semantic_private)) {
      if (sync->semantics & semantic_atomic)
         set->access_atomic |= sync->storage;
      else
         set->access_relaxed |= sync->storage;
   }
}

void add_to_hazard_query(hazard_query *query, Instruction *instr)
{
   if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
      query->contains_spill = true;
   query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;

   memory_sync_info sync = get_sync_info_with_hack(instr);

   add_memory_event(&query->mem_events, instr, &sync);

   if (!(sync.semantics & semantic_can_reorder)) {
      unsigned storage = sync.storage;
      /* images and buffer/global memory can alias */
      //TODO: more precisely, buffer images and buffer/global memory can alias
      if (storage & (storage_buffer | storage_image))
         storage |= storage_buffer | storage_image;
      if (instr->format == Format::SMEM)
         query->aliasing_storage_smem |= storage;
      else
         query->aliasing_storage |= storage;
   }
}

enum HazardResult {
   hazard_success,
   hazard_fail_reorder_vmem_smem,
   hazard_fail_reorder_ds,
   hazard_fail_reorder_sendmsg,
   hazard_fail_spill,
   hazard_fail_export,
   hazard_fail_barrier,
   /* Must stop at these failures. The hazard query code doesn't consider them
    * when instructions are added. */
   hazard_fail_exec,
   hazard_fail_unreorderable,
};

HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards)
{
   if (instr->opcode == aco_opcode::p_exit_early_if)
      return hazard_fail_exec;
   for (const Definition& def : instr->definitions) {
      if (def.isFixed() && def.physReg() == exec)
         return hazard_fail_exec;
   }

   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return hazard_fail_export;

   /* don't move non-reorderable instructions */
   if (instr->opcode == aco_opcode::s_memtime ||
       instr->opcode == aco_opcode::s_memrealtime ||
       instr->opcode == aco_opcode::s_setprio ||
       instr->opcode == aco_opcode::s_getreg_b32)
      return hazard_fail_unreorderable;

   memory_event_set instr_set;
   memset(&instr_set, 0, sizeof(instr_set));
   memory_sync_info sync = get_sync_info_with_hack(instr);
   add_memory_event(&instr_set, instr, &sync);

   memory_event_set *first = &instr_set;
   memory_event_set *second = &query->mem_events;
   if (upwards)
      std::swap(first, second);

   /* everything after barrier(acquire) happens after the atomics/control_barriers before
    * everything after load(acquire) happens after the load
    */
   if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
      return hazard_fail_barrier;
   if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
       ((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic)))
      return hazard_fail_barrier;

   /* everything before barrier(release) happens before the atomics/control_barriers after
    * everything before store(release) happens before the store
    */
   if (first->bar_release && (second->has_control_barrier || second->access_atomic))
      return hazard_fail_barrier;
   if ((first->bar_classes && (second->bar_release || second->access_release)) ||
       ((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release)))
      return hazard_fail_barrier;

   /* don't move memory barriers around other memory barriers */
   if (first->bar_classes && second->bar_classes)
      return hazard_fail_barrier;

   /* Don't move memory accesses to before control barriers. I don't think
    * this is necessary for the Vulkan memory model, but it might be for GLSL450. */
   unsigned control_classes = storage_buffer | storage_atomic_counter | storage_image | storage_shared;
   if (first->has_control_barrier && ((second->access_atomic | second->access_relaxed) & control_classes))
      return hazard_fail_barrier;

   /* don't move memory loads/stores past potentially aliasing loads/stores */
   unsigned aliasing_storage = instr->format == Format::SMEM ?
                               query->aliasing_storage_smem :
                               query->aliasing_storage;
   if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
      unsigned intersect = sync.storage & aliasing_storage;
      if (intersect & storage_shared)
         return hazard_fail_reorder_ds;
      return hazard_fail_reorder_vmem_smem;
   }

   if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
       query->contains_spill)
      return hazard_fail_spill;

   if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg)
      return hazard_fail_reorder_sendmsg;

   return hazard_success;
}

void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* first, check if we have instructions before current to move down */
   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   ctx.mv.downwards_init(idx, false, false);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      assert(candidate_idx == ctx.mv.source_idx);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM())
         break;

      bool can_move_down = true;

      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      /* don't use LDS/GDS instructions to hide latency since it can
       * significantly worsen LDS scheduling */
      if (candidate->format == Format::DS || !can_move_down) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      k++;
   }

   /* find the first instruction depending on current or find another MEM */
   ctx.mv.upwards_init(idx + 1, false);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps();
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;

      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
             haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&hq);
            found_dependency = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&hq, candidate.get());
         else
            k++;
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         /* no need to steal from following VMEM instructions */
         if (res == move_fail_ssa && candidate->isVMEM())
            break;
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }

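   /* Heuristic stall estimate for this load: while negative there is still
    * slack; later downwards moves past last_SMEM_dep_idx increment it (see
    * the can_stall_prev_smem checks), and once it is non-negative, further
    * motion is assumed to make this load stall. */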
   ctx.last_SMEM_dep_idx = found_dependency ? ctx.mv.insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}

void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;

   /* first, check if we have instructions before current to move down */
   hazard_query indep_hq;
   hazard_query clause_hq;
   init_hazard_query(&indep_hq);
   init_hazard_query(&clause_hq);
   add_to_hazard_query(&indep_hq, current);

   ctx.mv.downwards_init(idx, true, true);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

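      /* only consider forming a clause with accesses of the same kind
       * (VMEM vs. flat/global); for VMEM both must additionally read the
       * same resource descriptor (operand 0) */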
      bool part_of_clause = false;
      if (current->isVMEM() == candidate->isVMEM()) {
         bool same_resource = true;
         if (current->isVMEM())
            same_resource = candidate->operands[0].tempId() == current->operands[0].tempId();
         int grab_dist = ctx.mv.insert_idx_clause - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause = same_resource && grab_dist < clause_max_grab_dist;
      }

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;

      HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
          haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      if (!can_move_down) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      Instruction *candidate_ptr = candidate.get();
      MoveResult res = ctx.mv.downwards_move(part_of_clause);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      if (part_of_clause)
         add_to_hazard_query(&indep_hq, candidate_ptr);
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* find the first instruction depending on current or find another VMEM */
   ctx.mv.upwards_init(idx + 1, true);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
             haz == hazard_fail_barrier || haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      is_dependency |= !found_dependency && !ctx.mv.upwards_check_deps();
      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&indep_hq);
            found_dependency = true;
         }
      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.mv.depends_on[def.tempId()] = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   ctx.mv.downwards_init(idx, true, false);

   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;

      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
         break;

      if (haz != hazard_success) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;
   ctx.mv.block = block;
   ctx.mv.register_demand = live_vars.register_demand[block->index].data();

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM() || current->isFlatOrGlobal()) {
         ctx.mv.current = current;
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }

      if (current->format == Format::SMEM) {
         ctx.mv.current = current;
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }
   }

   if ((program->stage & (hw_vs | hw_ngg_gs)) && (block->kind & block_kind_export_end)) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM) {
               ctx.mv.current = current;
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
            }
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}


void schedule_program(Program *program, live& live_vars)
{
   /* don't use program->max_reg_demand because that is affected by max_waves_per_simd */
   RegisterDemand demand;
   for (Block& block : program->blocks)
      demand.update(block.register_demand);

   sched_ctx ctx;
   ctx.mv.depends_on.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (demand.vgpr >= 29)
      ctx.num_waves = 5;
   else if (demand.vgpr >= 25)
      ctx.num_waves = 6;
   else
      ctx.num_waves = 7;
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
   ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->max_waves);

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
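   /* pressure bound for code motion; the -2 presumably leaves a little VGPR
    * headroom so that scheduling decisions don't force a lower wave count */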
   ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2),
                            int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
   #if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
   #endif
}

}