//! This module implements a late-stage redundant-reload remover, which runs after registers have
//! been allocated and stack slots have been given specific offsets.

use crate::cursor::{Cursor, CursorPosition, EncCursor, FuncCursor};
use crate::entity::EntitySet;
use crate::flowgraph::ControlFlowGraph;
use crate::ir::dfg::DataFlowGraph;
use crate::ir::instructions::BranchInfo;
use crate::ir::stackslot::{StackSlotKind, StackSlots};
use crate::ir::{
    Block, Function, Inst, InstBuilder, InstructionData, Opcode, StackSlotData, Type, Value,
    ValueLoc,
};
use crate::isa::{RegInfo, RegUnit, TargetIsa};
use crate::regalloc::RegDiversions;
use alloc::vec::Vec;
use core::convert::TryInto;
use cranelift_entity::{PrimaryMap, SecondaryMap};

// =============================================================================================
// A description of the redundant-fill-removal algorithm
//
//
// The algorithm works forwards through each Block. It carries along and updates a table,
// AvailEnv, with which it tracks registers that are known to have the same value as some stack
// slot. The actions on encountering an instruction depend on the instruction, as follows:
//
// ss1 = spill r0:   update the AvailEnv so as to note that slot `ss1` and register `r0`
//                   have the same value.
//
// r1 = fill ss0:    look in the AvailEnv. If it tells us that register `r1` and slot `ss0`
//                   have the same value, then delete the instruction by converting it to a
//                   `fill_nop`.
//
//                   If it tells us that some other register `r2` has the same value as
//                   slot `ss0`, convert the instruction into a copy from `r2` to `r1`.
//
// any other insn:   remove from the AvailEnv any bindings associated with registers
//                   written by this instruction, since they will be invalidated by it.
//
// Tracking the effects of `copy` instructions in AvailEnv for the case when both source and
// destination are registers does not cause any more fills to be removed or converted to copies.
// It's not clear why.
//
// There are various other instruction-handling cases in `visit_inst`, which are documented
// in-line, and which do not change the core algorithm, so they are not described here.
//
// The registers tracked by AvailEnv are the post-diversion registers that are really used by
// the code; they are not the pre-diversion names associated with each SSA `Value`. The second
// `fill` case above opportunistically copies values from registers that may have been diversion
// targets in some predecessor block, and so are no longer associated with any specific
// SSA-level name at the point the copy is made. Hence those copies (from `r2` to `r1`) cannot
// be done with an ordinary `copy` instruction. Instead they have to be done using a new
// `copy_to_ssa` instruction, which copies from an arbitrary register to a register-resident
// `Value` (that is, "back to" SSA-world).
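//
// As a concrete illustration of those two `fill` cases (made up for exposition; not taken from
// any real function), the pass rewrites
//
//   ss0 = spill r1
//   ..                     (nothing here writes r1 or r2, or changes ss0)
//   r1 = fill ss0
//   r2 = fill ss0
//
// into
//
//   ss0 = spill r1
//   ..
//   fill_nop               (r1 already held the value of ss0, so the fill was redundant)
//   r2 = copy_to_ssa r1    (the value is available in r1, so the fill becomes a copy)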
//
// That completes the description of the core algorithm.
//
// In the case where a block `A` jumps to `B` and `A` is the only predecessor of `B`, the
// AvailEnv at the end of `A` will still be valid at the entry to `B`. In such a case, we can
// profitably transform `B` using the AvailEnv "inherited" from `A`. In order to take full
// advantage of this, this module partitions the function's CFG into tree-shaped groups of
// blocks, and processes each tree as described above. So the AvailEnv is only initialised to
// empty at the start of blocks that form the root of each tree; that is, for blocks which have
// two or more predecessors.
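//
// For example (illustrative only): suppose the entry block0 branches to block1 and block2,
// both of which then jump to block3. block0 is a root because it is the entry node, and
// block3 is a root because it has two predecessors, so the CFG is partitioned into two trees,
// {block0, block1, block2} and {block3}. block1 and block2 inherit block0's AvailEnv, whereas
// block3 starts with an empty one.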

// =============================================================================================
// Top level algorithm structure
//
// The overall algorithm, for a function, starts like this:
//
// * (once per function): finds Blocks that have two or more predecessors, since they will be
//   the roots of Block trees. Also, the entry node for the function is considered to be a
//   root.
//
// It then continues with a loop that first finds a tree of Blocks ("discovery") and then
// removes redundant fills as described above ("processing"):
//
// * (discovery; once per tree): for each root, performs a depth first search to find all the
//   Blocks in the tree, guided by RedundantReloadRemover::discovery_stack.
//
// * (processing; once per tree): the just-discovered tree is then processed as described
//   above, guided by RedundantReloadRemover::processing_stack.
//
// In this way, all Blocks reachable from the function's entry point are eventually processed.
// Note that each tree is processed as soon as it has been discovered, so the algorithm never
// creates a list of trees for the function.
//
// The running state is stored in `RedundantReloadRemover`. This is allocated once and can be
// reused for multiple functions so as to minimise heap turnover. The fields are, roughly:
//
//   num_regunits          -- constant for the whole function; used by the tree processing
//                            phase
//   num_preds_per_block   -- constant for the whole function; used by the tree discovery
//                            phase
//
//   discovery_stack       -- used to guide the tree discovery phase
//   nodes_in_tree         -- the discovered nodes are recorded here
//
//   processing_stack      -- used to guide the tree processing phase
//   nodes_already_visited -- used to ensure the tree processing logic terminates in the case
//                            where a tree has a branch back to its root node
//
// There is further documentation in line below, as appropriate.

// =============================================================================================
// A side note on register choice heuristics

// The core algorithm opportunistically replaces fill instructions when it knows of a register
// that already holds the required value. How effective this is largely depends on how long
// reloaded values happen to stay alive before the relevant register is overwritten, and that
// depends on the register allocator's register choice heuristics. The worst case is when the
// register allocator reuses registers as soon as possible after they become free.
// Unfortunately, that was indeed the selection scheme prior to the development of this pass.
//
// As part of this work, the register selection scheme has been changed as follows: for registers
// written by any instruction other than a fill, use the lowest numbered available register. But
// for registers written by a fill instruction, use the highest numbered available register. The
// aim is to try to keep reload and non-reload registers disjoint to the extent possible.
// Several other schemes were tried, but this one is simple and can be worth an extra 2% of
// performance in some cases.
//
// The relevant change is more or less a one-line change in the solver.

// =============================================================================================
// Data structures used for discovery of trees

// `ZeroOneOrMany` is used to record the number of predecessors a Block has. The `Zero` case is
// included so as to cleanly handle the case where the incoming graph has unreachable Blocks.

#[derive(Clone, PartialEq)]
enum ZeroOneOrMany {
    Zero,
    One,
    Many,
}

// =============================================================================================
// Data structures used for processing of trees

// `SlotInfo` describes a spill slot in the obvious way. Note that it doesn't indicate which
// register(s) are currently associated with the slot. That job is done by `AvailEnv` instead.
//
// In the CL framework, stack slots are partitioned into disjoint sets, one for each
// `StackSlotKind`. The offset and size only give a unique identity within any particular
// `StackSlotKind`. So, to uniquely identify a stack slot, all three fields are necessary.

#[derive(Clone, Copy)]
struct SlotInfo {
    kind: StackSlotKind,
    offset: i32,
    size: u32,
}

// `AvailEnv` maps each possible register to a stack slot that holds the same value. The index
// space of `AvailEnv::map` is exactly the set of registers available on the current target. If
// (as is mostly the case) a register is not known to have the same value as a stack slot, then
// its entry is `None` rather than `Some(..)`.
//
// Invariants for AvailEnv:
//
// AvailEnv may have multiple different registers bound to the same stack slot -- that is, to
// the same `(kind, offset, size)` triple. That's OK, and reflects the reality that those
// registers contain the same value. This could happen, for example, in the case
//
//   ss1 = spill r0
//   ..
//   r2 = fill ss1
//
// Then both `r0` and `r2` will have the same value as `ss1`, provided that ".." doesn't write
// to `r0`.
//
// To say that two different registers may be bound to the same stack slot is the same as saying
// that it is allowed to have two different entries in AvailEnv with the same `(kind, offset,
// size)` triple. What is *not* allowed is to have partial overlaps. That is, if two SlotInfos
// have the same `kind` field and have `offset` and `size` fields that overlap, then their
// `offset` and `size` fields must be identical. This is so as to make the algorithm safe
// against situations where, for example, a 64 bit register is spilled, but then only the bottom
// 32 bits are reloaded from the slot. In such a case the Cranelift IR would most likely be
// ill-typed, though, so this situation probably cannot occur in practice.
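//
// For example (illustrative only): suppose `ss2` is an 8-byte spill slot and `ss3` is a
// hypothetical 4-byte slot occupying its low half, both at offset 16. Then in
//
//   ss2 = spill r3       (binds {r3 -> (SpillSlot, 16, 8)})
//   ..
//   r4 = fill ss3        (wants to bind {r4 -> (SpillSlot, 16, 4)})
//
// the two bindings would partially overlap. So `bind` first invalidates every existing binding
// that overlaps the new `(kind, offset, size)` triple, and only then records the new one.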

#[derive(Clone)]
struct AvailEnv {
    map: Vec<Option<SlotInfo>>,
}

// `ProcessingStackElem` combines AvailEnv with contextual information needed to "navigate"
// within a Block.
//
// A ProcessingStackElem conceptually has the lifetime of exactly one Block: once the current
// Block is completed, the ProcessingStackElem will be abandoned. In practice the top level
// state, RedundantReloadRemover, caches them, so as to avoid heap turnover.
//
// Note that ProcessingStackElem must contain a CursorPosition. The CursorPosition, which
// indicates where we are in the current Block, cannot be implicitly maintained by looping over
// all the instructions in a Block in turn, because we may choose to suspend processing the
// current Block at a side exit, continue by processing the subtree reached via the side exit,
// and only later resume the current Block. This suspend-and-resume dance is sketched below.
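//
// For instance (illustrative only): if Block A contains a side exit to Block B, and A is B's
// only predecessor, the processing stack evolves roughly as
//
//   [ A @ side-exit ]               -- A is suspended at the branch to B
//   [ A @ side-exit, B @ start ]    -- B is pushed and processed to completion
//   [ A @ side-exit ]               -- B is popped; A resumes just after the branch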

struct ProcessingStackElem {
    /// Indicates the AvailEnv at the current point in the Block.
    avail_env: AvailEnv,

    /// Shows where we currently are inside the Block.
    cursor: CursorPosition,

    /// Indicates the currently active register diversions at the current point.
    diversions: RegDiversions,
}

// =============================================================================================
// The top level data structure

// `RedundantReloadRemover` contains data structures for the two passes: discovery of tree
// shaped regions, and processing of them. These are allocated once and stay alive for the
// entire function, even though they are cleared out for each new tree shaped region. It also
// caches `num_regunits` and `num_preds_per_block`, which are computed at the start of each
// function and then remain constant.

/// The redundant reload remover's state.
pub struct RedundantReloadRemover {
    /// The total number of RegUnits available on this architecture. This is unknown when the
    /// RedundantReloadRemover is created. It becomes known at the beginning of processing of a
    /// function.
    num_regunits: Option<u16>,

    /// This stores, for each Block, a characterisation of the number of predecessors it has.
    num_preds_per_block: PrimaryMap<Block, ZeroOneOrMany>,

    /// The stack used for the first phase (discovery). There is one element on the discovery
    /// stack for each currently unexplored Block in the tree being searched.
    discovery_stack: Vec<Block>,

    /// The nodes in the discovered tree are inserted here.
    nodes_in_tree: EntitySet<Block>,

    /// The stack used during the second phase (transformation). There is one element on the
    /// processing stack for each currently-open node in the tree being transformed.
    processing_stack: Vec<ProcessingStackElem>,

    /// Used in the second phase to avoid visiting nodes more than once.
    nodes_already_visited: EntitySet<Block>,
}

// =============================================================================================
// Miscellaneous small helper functions

// Is this a kind of stack slot that is safe to track in AvailEnv? This is probably overly
// conservative, but tracking only the SpillSlot and IncomingArg kinds catches almost all
// available redundancy in practice.
fn is_slot_kind_tracked(kind: StackSlotKind) -> bool {
    match kind {
        StackSlotKind::SpillSlot | StackSlotKind::IncomingArg => true,
        _ => false,
    }
}

// Find out if the range `[offset, offset + size)` overlaps with the range in `si`.
fn overlaps(si: &SlotInfo, offset: i32, size: u32) -> bool {
    let a_offset = si.offset as i64;
    let a_size = si.size as i64;
    let b_offset = offset as i64;
    let b_size = size as i64;
    let no_overlap = a_offset + a_size <= b_offset || b_offset + b_size <= a_offset;
    !no_overlap
}
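
// For example, a slot at `(offset=8, size=8)` overlaps slots at `(8, 8)` and `(12, 8)`, but
// not slots at `(0, 8)` or `(16, 4)`: ranges that merely abut do not overlap. The test sketch
// at the end of this file checks exactly these cases.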

// Find, in `reginfo`, the register bank that `reg` lives in, and return the lower limit and
// size of the bank. This is so the caller can conveniently iterate over all RegUnits in the
// bank that `reg` lives in.
fn find_bank_limits(reginfo: &RegInfo, reg: RegUnit) -> (RegUnit, u16) {
    if let Some(bank) = reginfo.bank_containing_regunit(reg) {
        return (bank.first_unit, bank.units);
    }
    // We should never get here, since `reg` must come from *some* RegBank.
    panic!("find_bank_limits: reg not found");
}

// Returns the register that `v` is allocated to. Assumes that `v` actually resides in a
// register.
fn reg_of_value(locations: &SecondaryMap<Value, ValueLoc>, v: Value) -> RegUnit {
    match locations[v] {
        ValueLoc::Reg(ru) => ru,
        _ => panic!("reg_of_value: value isn't in a reg"),
    }
}

// Returns the stack slot that `v` is allocated to. Assumes that `v` actually resides in a
// stack slot.
fn slot_of_value<'s>(
    locations: &SecondaryMap<Value, ValueLoc>,
    stack_slots: &'s StackSlots,
    v: Value,
) -> &'s StackSlotData {
    match locations[v] {
        ValueLoc::Stack(slot) => &stack_slots[slot],
        _ => panic!("slot_of_value: value isn't in a stack slot"),
    }
}

// =============================================================================================
// Top level: discovery of tree shaped regions

impl RedundantReloadRemover {
    // A helper for `add_nodes_to_tree` below.
    fn discovery_stack_push_successors_of(&mut self, cfg: &ControlFlowGraph, node: Block) {
        for successor in cfg.succ_iter(node) {
            self.discovery_stack.push(successor);
        }
    }

    // Visit the tree of Blocks rooted at `starting_point` and add them to `self.nodes_in_tree`.
    // `self.num_preds_per_block` guides the process, ensuring we don't leave the tree-ish
    // region and indirectly ensuring that the process will terminate in the presence of cycles
    // in the graph. `self.discovery_stack` holds the search state in this function.
    fn add_nodes_to_tree(&mut self, cfg: &ControlFlowGraph, starting_point: Block) {
        // One might well ask why this doesn't loop forever when it encounters cycles in the
        // control flow graph. The reason is that any cycle in the graph that is reachable from
        // anywhere outside the cycle -- in particular, that is reachable from the function's
        // entry node -- must have at least one node that has two or more predecessors. So the
        // logic below won't follow into it, because it regards any such node as the root of
        // some other tree.
        debug_assert!(self.discovery_stack.is_empty());
        debug_assert!(self.nodes_in_tree.is_empty());

        self.nodes_in_tree.insert(starting_point);
        self.discovery_stack_push_successors_of(cfg, starting_point);

        while let Some(node) = self.discovery_stack.pop() {
            match self.num_preds_per_block[node] {
                // We arrived at a node with multiple predecessors, so it's a new root. Ignore
                // it.
                ZeroOneOrMany::Many => {}
                // This node has just one predecessor, so we should incorporate it in the tree
                // and immediately transition into searching from it instead.
                ZeroOneOrMany::One => {
                    self.nodes_in_tree.insert(node);
                    self.discovery_stack_push_successors_of(cfg, node);
                }
                // This is meaningless. We arrived at a node that doesn't point back at where
                // we came from.
                ZeroOneOrMany::Zero => panic!("add_nodes_to_tree: inconsistent graph"),
            }
        }
    }
}

// =============================================================================================
// Operations relating to `AvailEnv`

impl AvailEnv {
    // Create a new one.
    fn new(size: usize) -> Self {
        let mut env = Self {
            map: Vec::<Option<SlotInfo>>::new(),
        };
        env.map.resize(size, None);
        env
    }

    // Debug only: checks (some of) the required AvailEnv invariants.
    #[cfg(debug_assertions)]
    fn check_invariants(&self) -> bool {
        // Check that any overlapping entries overlap exactly. This is super lame (quadratic),
        // but it's only used in debug builds.
        for i in 0..self.map.len() {
            if let Some(si) = self.map[i] {
                for j in i + 1..self.map.len() {
                    if let Some(sj) = self.map[j] {
                        // "si and sj overlap, but not exactly"
                        if si.kind == sj.kind
                            && overlaps(&si, sj.offset, sj.size)
                            && !(si.offset == sj.offset && si.size == sj.size)
                        {
                            return false;
                        }
                    }
                }
            }
        }
        true
    }

    // Invalidates the binding associated with `reg`. Note that by construction of AvailEnv,
    // `reg` can only be associated with one binding at a time.
    fn invalidate_by_reg(&mut self, reg: RegUnit) {
        self.map[reg as usize] = None;
    }

    // Invalidates any binding that has any overlap with `(kind, offset, size)`.
    fn invalidate_by_offset(&mut self, kind: StackSlotKind, offset: i32, size: u32) {
        debug_assert!(is_slot_kind_tracked(kind));
        for i in 0..self.map.len() {
            if let Some(si) = &self.map[i] {
                if si.kind == kind && overlaps(&si, offset, size) {
                    self.map[i] = None;
                }
            }
        }
    }

    // Invalidates all bindings.
    fn invalidate_all(&mut self) {
        for i in 0..self.map.len() {
            self.map[i] = None;
        }
    }

    // Updates AvailEnv to track the effect of a `regmove` instruction.
    fn copy_reg(&mut self, src: RegUnit, dst: RegUnit) {
        self.map[dst as usize] = self.map[src as usize];
    }

    // Does `env` have the exact binding characterised by `(reg, kind, offset, size)`?
    fn has_exact_binding(&self, reg: RegUnit, kind: StackSlotKind, offset: i32, size: u32) -> bool {
        debug_assert!(is_slot_kind_tracked(kind));
        if let Some(si) = &self.map[reg as usize] {
            return si.kind == kind && si.offset == offset && si.size == size;
        }
        // No such binding.
        false
    }

    // Does `env` have a binding characterised by `(kind, offset, size)` but to a register,
    // let's call it `other_reg`, that isn't `reg`? If so, return `other_reg`. Note that
    // `other_reg` will have the same bank as `reg`. It is a checked error to call this
    // function with a binding matching all four of `(reg, kind, offset, size)`.
    fn has_inexact_binding(
        &self,
        reginfo: &RegInfo,
        reg: RegUnit,
        kind: StackSlotKind,
        offset: i32,
        size: u32,
    ) -> Option<RegUnit> {
        debug_assert!(is_slot_kind_tracked(kind));
        // Find the range of RegUnit numbers for the bank that contains `reg`, and use that as
        // our search space. This is so as to guarantee that any match is restricted to the
        // same bank as `reg`.
        let (first_unit, num_units) = find_bank_limits(reginfo, reg);
        for other_reg in first_unit..first_unit + num_units {
            if let Some(si) = &self.map[other_reg as usize] {
                if si.kind == kind && si.offset == offset && si.size == size {
                    if other_reg == reg {
                        panic!("has_inexact_binding: binding *is* exact!");
                    }
                    return Some(other_reg);
                }
            }
        }
        // No such binding.
        None
    }

    // Create the binding `(reg, kind, offset, size)` in `env`, and throw away any previous
    // binding associated with either `reg` or the `(kind, offset, size)` triple.
    fn bind(&mut self, reg: RegUnit, kind: StackSlotKind, offset: i32, size: u32) {
        debug_assert!(is_slot_kind_tracked(kind));
        self.invalidate_by_offset(kind, offset, size);
        self.map[reg as usize] = Some(SlotInfo { kind, offset, size });
    }
}

// Invalidates, in `avail_env`, any binding associated with a regunit that is written by `inst`.
fn invalidate_regs_written_by_inst(
    locations: &SecondaryMap<Value, ValueLoc>,
    diversions: &RegDiversions,
    dfg: &DataFlowGraph,
    avail_env: &mut AvailEnv,
    inst: Inst,
) {
    for v in dfg.inst_results(inst).iter() {
        if let ValueLoc::Reg(ru) = locations[*v] {
            // This must be true. It would be meaningless for an SSA value to be diverted
            // before the point where it is defined.
            debug_assert!(diversions.reg(*v, locations) == ru);
            avail_env.invalidate_by_reg(ru);
        }
    }
}

// =============================================================================================
// Processing of individual instructions

impl RedundantReloadRemover {
    // Process `inst`, possibly changing it into a different instruction, and possibly changing
    // the AvailEnv at the top of `self.processing_stack`, as well as `func.dfg`.
    fn visit_inst(
        &mut self,
        func: &mut Function,
        reginfo: &RegInfo,
        isa: &dyn TargetIsa,
        inst: Inst,
    ) {
        // Get hold of the top-of-stack work item. This is the state that we will mutate during
        // processing of this instruction.
        debug_assert!(!self.processing_stack.is_empty());
        let ProcessingStackElem {
            avail_env,
            diversions,
            ..
        } = self.processing_stack.last_mut().unwrap();

        #[cfg(debug_assertions)]
        debug_assert!(
            avail_env.check_invariants(),
            "visit_inst: env invariants not ok"
        );

        let dfg = &mut func.dfg;
        let locations = &func.locations;
        let stack_slots = &func.stack_slots;

        // To avoid difficulties with the borrow checker, do this in two stages. First, examine
        // the instruction to see if it can be deleted or modified, and park the relevant
        // information in `transform`. Update `avail_env` too. Later, use `transform` to
        // actually do the transformation if necessary.
        enum Transform {
            NoChange,
            ChangeToNopFill(Value),           // delete this insn entirely
            ChangeToCopyToSSA(Type, RegUnit), // change it into a copy from the specified reg
        }
        let mut transform = Transform::NoChange;

        // In this match { .. } statement, either we must treat the instruction specially, or
        // we must call `invalidate_regs_written_by_inst` on it.
        match &dfg[inst] {
            InstructionData::Unary {
                opcode: Opcode::Spill,
                arg: src_value,
            } => {
                // Extract:          (src_reg, kind, offset, size)
                // Invalidate:       (kind, offset, size)
                // Add new binding:  {src_reg -> (kind, offset, size)}
                // Don't forget that src_value might be diverted, so we have to deref it.
                let slot = slot_of_value(locations, stack_slots, dfg.inst_results(inst)[0]);
                let src_reg = diversions.reg(*src_value, locations);
                let kind = slot.kind;
                if is_slot_kind_tracked(kind) {
                    let offset = slot.offset.expect("visit_inst: spill with no offset");
                    let size = slot.size;
                    avail_env.bind(src_reg, kind, offset, size);
                } else {
                    // We don't expect this insn to write any regs. But to be consistent with
                    // the rule above, do this anyway.
                    invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
                }
            }
            InstructionData::Unary {
                opcode: Opcode::Fill,
                arg: src_value,
            } => {
                // Extract:          (dst_reg, kind, offset, size)
                // Invalidate:       (kind, offset, size)
                // Add new binding:  {dst_reg -> (kind, offset, size)}
                let slot = slot_of_value(locations, stack_slots, *src_value);
                let dst_value = dfg.inst_results(inst)[0];
                let dst_reg = reg_of_value(locations, dst_value);
                // This must be true. It would be meaningless for an SSA value to be diverted
                // before it was defined.
                debug_assert!(dst_reg == diversions.reg(dst_value, locations));
                let kind = slot.kind;
                if is_slot_kind_tracked(kind) {
                    let offset = slot.offset.expect("visit_inst: fill with no offset");
                    let size = slot.size;
                    if avail_env.has_exact_binding(dst_reg, kind, offset, size) {
                        // This instruction is an exact copy of a fill we saw earlier, and the
                        // loaded value is still valid. So we'll schedule this instruction for
                        // deletion (below). No need to make any changes to `avail_env`.
                        transform = Transform::ChangeToNopFill(*src_value);
                    } else if let Some(other_reg) =
                        avail_env.has_inexact_binding(reginfo, dst_reg, kind, offset, size)
                    {
                        // This fill is from the required slot, but into a different register
                        // `other_reg`. So replace it with a copy from `other_reg` to `dst_reg`
                        // and update `dst_reg`'s binding to make it the same as `other_reg`'s,
                        // so as to maximise the chances of future matches after this
                        // instruction.
                        debug_assert!(other_reg != dst_reg);
                        transform =
                            Transform::ChangeToCopyToSSA(dfg.value_type(dst_value), other_reg);
                        avail_env.copy_reg(other_reg, dst_reg);
                    } else {
                        // This fill creates some new binding we don't know about. Update
                        // `avail_env` to track it.
                        avail_env.bind(dst_reg, kind, offset, size);
                    }
                } else {
                    // Else it's "just another instruction that writes a reg", so we'd better
                    // treat it as such, just as we do below for instructions that we don't
                    // handle specially.
                    invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
                }
            }
            InstructionData::RegMove { src, dst, .. } => {
                // These happen relatively rarely, but just frequently enough that it's worth
                // tracking the copy (at the machine level, it's really a copy) in `avail_env`.
                avail_env.copy_reg(*src, *dst);
            }
            InstructionData::RegSpill { .. }
            | InstructionData::RegFill { .. }
            | InstructionData::Call { .. }
            | InstructionData::CallIndirect { .. }
            | InstructionData::StackLoad { .. }
            | InstructionData::StackStore { .. }
            | InstructionData::Unary {
                opcode: Opcode::AdjustSpDown,
                ..
            }
            | InstructionData::UnaryImm {
                opcode: Opcode::AdjustSpUpImm,
                ..
            }
            | InstructionData::UnaryImm {
                opcode: Opcode::AdjustSpDownImm,
                ..
            } => {
                // All of these change, or might change, the memory-register bindings tracked
                // in `avail_env` in some way we don't know about, or at least in some way that
                // we could track but for which the effort-to-benefit ratio seems too low to
                // bother. So play safe: forget everything we know.
                //
                // For Call/CallIndirect, we could do better when compiling for calling
                // conventions that have callee-saved registers, since bindings for them would
                // remain valid across the call.
                avail_env.invalidate_all();
            }
            _ => {
                // Invalidate: any `avail_env` entry associated with a reg written by `inst`.
                invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
            }
        }

        // Actually do the transformation.
        match transform {
            Transform::NoChange => {}
            Transform::ChangeToNopFill(arg) => {
                // The fill is completely redundant. Convert it to a no-op.
                dfg.replace(inst).fill_nop(arg);
                let ok = func.update_encoding(inst, isa).is_ok();
                debug_assert!(ok, "fill_nop encoding missing for this type");
            }
            Transform::ChangeToCopyToSSA(ty, reg) => {
                // We already have the relevant value in some other register. Convert the fill
                // into a reg-reg copy.
                dfg.replace(inst).copy_to_ssa(ty, reg);
                let ok = func.update_encoding(inst, isa).is_ok();
                debug_assert!(ok, "copy_to_ssa encoding missing for type {}", ty);
            }
        }
    }
}

// =============================================================================================
// Top level: processing of tree shaped regions

impl RedundantReloadRemover {
    // Push a clone of the top-of-stack ProcessingStackElem. This will be used to process
    // exactly one Block. The diversions are created new, rather than cloned, to reflect the
    // fact that diversions are local to each Block.
    fn processing_stack_push(&mut self, cursor: CursorPosition) {
        let avail_env = if let Some(stack_top) = self.processing_stack.last() {
            stack_top.avail_env.clone()
        } else {
            AvailEnv::new(
                self.num_regunits
                    .expect("processing_stack_push: num_regunits unknown!")
                    as usize,
            )
        };
        self.processing_stack.push(ProcessingStackElem {
            avail_env,
            cursor,
            diversions: RegDiversions::new(),
        });
    }

    // This pushes the node `dst` onto the processing stack, and sets up the new
    // ProcessingStackElem accordingly. But it does all that only if `dst` is part of the
    // current tree *and* we haven't yet visited it.
    fn processing_stack_maybe_push(&mut self, dst: Block) {
        if self.nodes_in_tree.contains(dst) && !self.nodes_already_visited.contains(dst) {
            if !self.processing_stack.is_empty() {
                // If this isn't the outermost node in the tree (that is, the root), then it
                // must have exactly one predecessor. Nodes with no predecessors are dead and
                // not incorporated in any tree. Nodes with two or more predecessors are the
                // root of some other tree, and visiting them as if they were part of the
                // current tree would be a serious error.
                debug_assert!(self.num_preds_per_block[dst] == ZeroOneOrMany::One);
            }
            self.processing_stack_push(CursorPosition::Before(dst));
            self.nodes_already_visited.insert(dst);
        }
    }

    // Perform redundant-reload removal on the tree shaped region of the graph defined by
    // `root` and `self.nodes_in_tree`. The following state is modified:
    // `self.processing_stack`, `self.nodes_already_visited`, and `func.dfg`.
    fn process_tree(
        &mut self,
        func: &mut Function,
        reginfo: &RegInfo,
        isa: &dyn TargetIsa,
        root: Block,
    ) {
        debug_assert!(self.nodes_in_tree.contains(root));
        debug_assert!(self.processing_stack.is_empty());
        debug_assert!(self.nodes_already_visited.is_empty());

        // Create the initial work item.
        self.processing_stack_maybe_push(root);

        while !self.processing_stack.is_empty() {
            // It seems somewhat ridiculous to construct a whole new FuncCursor just so we can
            // do next_inst() on it once, and then copy the resulting position back out. But
            // use of a function-global FuncCursor, or of the EncCursor in struct Context,
            // leads to borrow checker problems, as does including FuncCursor directly in
            // ProcessingStackElem. In any case this is not as bad as it looks, since profiling
            // shows that the build-insert-step-extract work is reduced to just 8 machine
            // instructions in an optimised x86_64 build, presumably because rustc can inline
            // and then optimise out almost all the work.
            let tos = self.processing_stack.len() - 1;
            let mut pos = FuncCursor::new(func).at_position(self.processing_stack[tos].cursor);
            let maybe_inst = pos.next_inst();
            self.processing_stack[tos].cursor = pos.position();

            if let Some(inst) = maybe_inst {
                // Deal with this insn, possibly changing it, possibly updating the top item of
                // `self.processing_stack`.
                self.visit_inst(func, reginfo, isa, inst);

                // Update diversions after the insn.
                self.processing_stack[tos].diversions.apply(&func.dfg[inst]);

                // If the insn can branch outside this Block, push work items on the stack for
                // all target Blocks that are part of the same tree and that we haven't yet
                // visited. The next iteration of this instruction-processing loop will
                // immediately start work on the most recently pushed Block, and will
                // eventually continue in this Block when those new items have been removed
                // from the stack.
                match func.dfg.analyze_branch(inst) {
                    BranchInfo::NotABranch => (),
                    BranchInfo::SingleDest(dst, _) => {
                        self.processing_stack_maybe_push(dst);
                    }
                    BranchInfo::Table(jt, default) => {
                        func.jump_tables[jt]
                            .iter()
                            .for_each(|dst| self.processing_stack_maybe_push(*dst));
                        if let Some(dst) = default {
                            self.processing_stack_maybe_push(dst);
                        }
                    }
                }
            } else {
                // We've come to the end of the current work-item (Block). We'll already have
                // processed the fallthrough/continuation/whatever for it using the logic
                // above. Pop it off the stack and resume work on its parent.
                self.processing_stack.pop();
            }
        }
    }
}

// =============================================================================================
// Top level: perform redundant fill removal for a complete function

impl RedundantReloadRemover {
    /// Create a new remover state.
    pub fn new() -> Self {
        Self {
            num_regunits: None,
            num_preds_per_block: PrimaryMap::<Block, ZeroOneOrMany>::with_capacity(8),
            discovery_stack: Vec::<Block>::with_capacity(16),
            nodes_in_tree: EntitySet::<Block>::new(),
            processing_stack: Vec::<ProcessingStackElem>::with_capacity(8),
            nodes_already_visited: EntitySet::<Block>::new(),
        }
    }

    /// Clear the state of the remover.
    pub fn clear(&mut self) {
        self.clear_for_new_function();
    }

    fn clear_for_new_function(&mut self) {
        self.num_preds_per_block.clear();
        self.clear_for_new_tree();
    }

    fn clear_for_new_tree(&mut self) {
        self.discovery_stack.clear();
        self.nodes_in_tree.clear();
        self.processing_stack.clear();
        self.nodes_already_visited.clear();
    }

    #[inline(never)]
    fn do_redundant_fill_removal_on_function(
        &mut self,
        func: &mut Function,
        reginfo: &RegInfo,
        isa: &dyn TargetIsa,
        cfg: &ControlFlowGraph,
    ) {
        // Fail in an obvious way if there are more than (2^32)-1 Blocks in this function.
        let num_blocks: u32 = func.dfg.num_blocks().try_into().unwrap();

        // Clear out per-function state.
        self.clear_for_new_function();

        // Create a PrimaryMap that summarises the number of predecessors for each block, as 0,
        // 1 or "many", and that also claims the entry block as having "many" predecessors.
        self.num_preds_per_block.clear();
        self.num_preds_per_block.reserve(num_blocks as usize);

        for i in 0..num_blocks {
            let mut pi = cfg.pred_iter(Block::from_u32(i));
            let mut n_pi = ZeroOneOrMany::Zero;
            if pi.next().is_some() {
                n_pi = ZeroOneOrMany::One;
                if pi.next().is_some() {
                    n_pi = ZeroOneOrMany::Many;
                    // We don't care if there are more than two preds, so stop counting now.
                }
            }
            self.num_preds_per_block.push(n_pi);
        }
        debug_assert!(self.num_preds_per_block.len() == num_blocks as usize);

        // The entry block must be the root of some tree, so set up the state to reflect that.
        let entry_block = func
            .layout
            .entry_block()
            .expect("do_redundant_fill_removal_on_function: entry block unknown");
        debug_assert!(self.num_preds_per_block[entry_block] == ZeroOneOrMany::Zero);
        self.num_preds_per_block[entry_block] = ZeroOneOrMany::Many;

        // Now build and process trees.
        for root_ix in 0..self.num_preds_per_block.len() {
            let root = Block::from_u32(root_ix as u32);

            // Build a tree for each node that has two or more preds, and ignore all other
            // nodes.
            if self.num_preds_per_block[root] != ZeroOneOrMany::Many {
                continue;
            }

            // Clear out per-tree state.
            self.clear_for_new_tree();

            // Discovery phase: build the tree, as `root` and `self.nodes_in_tree`.
            self.add_nodes_to_tree(cfg, root);
            debug_assert!(self.nodes_in_tree.cardinality() > 0);
            debug_assert!(self.num_preds_per_block[root] == ZeroOneOrMany::Many);

            // Processing phase: do redundant-reload removal.
            self.process_tree(func, reginfo, isa, root);
            debug_assert!(
                self.nodes_in_tree.cardinality() == self.nodes_already_visited.cardinality()
            );
        }
    }
}

// =============================================================================================
// Top level: the external interface

struct Context<'a> {
    // Current instruction as well as a reference to the function and ISA.
    cur: EncCursor<'a>,

    // Cached ISA information. We save it here to avoid frequent virtual function calls on the
    // `TargetIsa` trait object.
    reginfo: RegInfo,

    // References to contextual data structures we need.
    cfg: &'a ControlFlowGraph,

    // The running state.
    state: &'a mut RedundantReloadRemover,
}

impl RedundantReloadRemover {
    /// Run the remover.
    pub fn run(&mut self, isa: &dyn TargetIsa, func: &mut Function, cfg: &ControlFlowGraph) {
        let ctx = Context {
            cur: EncCursor::new(func, isa),
            reginfo: isa.register_info(),
            cfg,
            state: self,
        };
        let mut total_regunits = 0;
        for rb in isa.register_info().banks {
            total_regunits += rb.units;
        }
        ctx.state.num_regunits = Some(total_regunits);
        ctx.state.do_redundant_fill_removal_on_function(
            ctx.cur.func,
            &ctx.reginfo,
            ctx.cur.isa,
            &ctx.cfg,
        );
    }
}
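// =============================================================================================
// A minimal test sketch
//
// These tests are an illustrative addition, not part of the pass itself. They exercise only
// the pure, target-independent helpers above (`overlaps` and `AvailEnv`), and assume nothing
// about any particular ISA.

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn overlaps_requires_a_nonempty_intersection() {
        let si = SlotInfo {
            kind: StackSlotKind::SpillSlot,
            offset: 8,
            size: 8,
        };
        // Coincident and partially overlapping ranges count as overlapping.
        assert!(overlaps(&si, 8, 8));
        assert!(overlaps(&si, 12, 8));
        // Ranges that merely abut do not.
        assert!(!overlaps(&si, 0, 8));
        assert!(!overlaps(&si, 16, 4));
    }

    #[test]
    fn bind_evicts_overlapping_bindings() {
        let mut env = AvailEnv::new(4);
        env.bind(0, StackSlotKind::SpillSlot, 0, 8);
        env.bind(1, StackSlotKind::SpillSlot, 8, 8);
        // Binding r2 to a slot that partially overlaps r1's slot must evict r1's binding so
        // as to preserve the no-partial-overlap invariant. r0's binding is disjoint and
        // survives.
        env.bind(2, StackSlotKind::SpillSlot, 12, 8);
        assert!(env.map[0].is_some());
        assert!(env.map[1].is_none());
        assert!(env.map[2].is_some());
        #[cfg(debug_assertions)]
        assert!(env.check_invariants());
    }
}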