1 //! This module implements a late-stage redundant-reload remover, which runs after registers have
2 //! been allocated and stack slots have been given specific offsets.
3 
4 use crate::cursor::{Cursor, CursorPosition, EncCursor, FuncCursor};
5 use crate::entity::EntitySet;
6 use crate::flowgraph::ControlFlowGraph;
7 use crate::ir::dfg::DataFlowGraph;
8 use crate::ir::instructions::BranchInfo;
9 use crate::ir::stackslot::{StackSlotKind, StackSlots};
10 use crate::ir::{
11     Block, Function, Inst, InstBuilder, InstructionData, Opcode, StackSlotData, Type, Value,
12     ValueLoc,
13 };
14 use crate::isa::{RegInfo, RegUnit, TargetIsa};
15 use crate::regalloc::RegDiversions;
16 use alloc::vec::Vec;
17 use core::convert::TryInto;
18 use cranelift_entity::{PrimaryMap, SecondaryMap};
19 
20 // =============================================================================================
21 // A description of the redundant-fill-removal algorithm
22 //
23 //
24 // The algorithm works forwards through each Block.  It carries along and updates a table,
25 // AvailEnv, with which it tracks registers that are known to have the same value as some stack
26 // slot.  The actions on encountering an instruction depend on the instruction, as follows:
27 //
28 // ss1 = spill r0: update the AvailEnv so as to note that slot `ss1` and register `r0`
29 //                 have the same value.
30 //
31 // r1 = fill ss0: look in the AvailEnv.  If it tells us that register `r1` and slot `ss0`
32 //                have the same value, then delete the instruction by converting it to a
33 //                `fill_nop`.
34 //
35 //                If it tells us that some other register `r2` has the same value as
36 //                slot `ss0`, convert the instruction into a copy from `r2` to `r1`.
37 //
38 // any other insn: remove from the AvailEnv any bindings associated with registers
39 //                 written by this instruction, since they will be invalidated by it.
40 //
41 // Tracking the effects of `copy` instructions in AvailEnv for the case when both source and
42 // destination are registers does not cause any more fills to be removed or converted to copies.
43 // It's not clear why.
44 //
45 // There are various other instruction-handling cases in `visit_inst`, which are documented
46 // in-line, and do not change the core algorithm, so are not described here.
47 //
48 // The registers tracked by AvailEnv are the post-diversion registers that are really used by the
49 // code; they are not the pre-diversion names associated with each SSA `Value`.  The second
50 // `fill` case above opportunistically copies values from registers that may have been diversion
51 // targets in some predecessor block, and so are no longer associated with any specific SSA-level
52 // name at the point the copy is made.  Hence those copies (from `r2` to `r1`) cannot be done
53 // with an ordinary `copy` instruction.  Instead they have to be done using a new `copy_to_ssa`
54 // instruction, which copies from an arbitrary register to a register-resident `Value` (that is,
55 // "back to" SSA-world).
56 //
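// As a purely illustrative example of the two `fill` cases (the register and slot names are
// made up), suppose `ss0` is a tracked spill slot:
//
//   ss0 = spill r0
//   ..                   <- nothing here writes `r0` or `ss0`
//   r0 = fill ss0        <- AvailEnv says `r0` already holds `ss0`'s value: becomes `fill_nop`
//   r1 = fill ss0        <- AvailEnv says `r0` holds `ss0`'s value: becomes `copy_to_ssa` from `r0`
//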
57 // That completes the description of the core algorithm.
58 //
59 // In the case where a block `A` jumps to `B` and `A` is the only predecessor of `B`, the
60 // AvailEnv at the end of `A` will still be valid at the entry to `B`.  In such a case, we can
61 // profitably transform `B` using the AvailEnv "inherited" from `A`.  In order to take full
62 // advantage of this, this module partitions the function's CFG into tree-shaped groups of
63 // blocks, and processes each tree as described above.  So the AvailEnv is only initialised to
64 // empty at the start of blocks that form the root of each tree; that is, for blocks which have
65 // two or more predecessors.
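//
// As a purely made-up example, consider a CFG with edges
//
//   entry -> B1,   B1 -> B2,   B1 -> B3,   B2 -> B4,   B3 -> B4,   B4 -> B1
//
// `entry` is always a root, and `B1` (predecessors: entry, B4) and `B4` (predecessors: B2, B3)
// are roots because each has two predecessors.  The resulting trees are {entry}, {B1, B2, B3}
// and {B4}.  Within the second tree, `B2` and `B3` inherit the AvailEnv in force at the
// relevant branch in `B1`, whereas `B4` starts from an empty AvailEnv.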
66 
67 // =============================================================================================
68 // Top level algorithm structure
69 //
70 // The overall algorithm, for a function, starts like this:
71 //
72 // * (once per function): finds Blocks that have two or more predecessors, since they will be the
73 //   roots of Block trees.  Also, the entry node for the function is considered to be a root.
74 //
75 // It then continues with a loop that first finds a tree of Blocks ("discovery") and then removes
76 // redundant fills as described above ("processing"):
77 //
78 // * (discovery; once per tree): for each root, performs a depth first search to find all the Blocks
79 //   in the tree, guided by RedundantReloadRemover::discovery_stack.
80 //
81 // * (processing; once per tree): the just-discovered tree is then processed as described above,
82 //   guided by RedundantReloadRemover::processing_stack.
83 //
84 // In this way, all Blocks reachable from the function's entry point are eventually processed.  Note
85 // that each tree is processed as soon as it has been discovered, so the algorithm never creates a
86 // list of trees for the function.
87 //
88 // The running state is stored in `RedundantReloadRemover`.  This is allocated once and can be
89 // reused for multiple functions so as to minimise heap turnover.  The fields are, roughly:
90 //
91 //   num_regunits -- constant for the whole function; used by the tree processing phase
92 //   num_preds_per_block -- constant for the whole function; used by the tree discovery process
93 //
94 //   discovery_stack -- used to guide the tree discovery process
95 //   nodes_in_tree -- the discovered nodes are recorded here
96 //
97 //   processing_stack -- used to guide the tree processing process
98 //   nodes_already_visited -- used to ensure the tree processing logic terminates in the case
99 //                            where a tree has a branch back to its root node.
100 //
101 // There is further documentation in line below, as appropriate.
102 
103 // =============================================================================================
104 // A side note on register choice heuristics
105 
106 // The core algorithm opportunistically replaces fill instructions when it knows of a register
107 // that already holds the required value.  How effective this is largely depends on how long
108 // reloaded values happen to stay alive before the relevant register is overwritten.  And that
109 // depends on the register allocator's register choice heuristics.  The worst case is when the
110 // register allocator reuses registers as soon as possible after they become free.  Unfortunately,
111 // that was indeed the selection scheme prior to the development of this pass.
112 //
113 // As part of this work, the register selection scheme has been changed as follows: for registers
114 // written by any instruction other than a fill, use the lowest numbered available register.  But
115 // for registers written by a fill instruction, use the highest numbered available register.  The
116 // aim is to try to keep reload- and non-reload registers disjoint to the extent possible.
117 // Several other schemes were tried, but this one is simple and can be worth an extra 2% of
118 // performance in some cases.
119 //
120 // The relevant change is more or less a one-line change in the solver.
121 
122 // =============================================================================================
123 // Data structures used for discovery of trees
124 
125 // `ZeroOneOrMany` is used to record the number of predecessors a Block has.  The `Zero` case
126 // is included so as to cleanly handle the case where the incoming graph has unreachable Blocks.
127 
128 #[derive(Clone, PartialEq)]
129 enum ZeroOneOrMany {
130     Zero,
131     One,
132     Many,
133 }
134 
135 // =============================================================================================
136 // Data structures used for processing of trees
137 
138 // `SlotInfo` describes a spill slot in the obvious way.  Note that it doesn't indicate which
139 // register(s) are currently associated with the slot.  That job is done by `AvailEnv` instead.
140 //
141 // In the Cranelift framework, stack slots are partitioned into disjoint sets, one for each
142 // `StackSlotKind`.  The offset and size only give a unique identity within any particular
143 // `StackSlotKind`.  So, to uniquely identify a stack slot, all three fields are necessary.
144 
145 #[derive(Clone, Copy)]
146 struct SlotInfo {
147     kind: StackSlotKind,
148     offset: i32,
149     size: u32,
150 }
151 
152 // `AvailEnv` maps each possible register to a stack slot that holds the same value.  The index
153 // space of `AvailEnv::map` is exactly the set of registers available on the current target.  If
154 // (as is mostly the case) a register is not known to have the same value as a stack slot, then
155 // its entry is `None` rather than `Some(..)`.
156 //
157 // Invariants for AvailEnv:
158 //
159 // AvailEnv may have multiple different registers bound to the same stack slot -- that is, to the
160 // same `(kind, offset, size)` triple.  That's OK, and reflects the reality that those registers
161 // all contain the same value.  This could happen, for example, in the case
162 //
163 //   ss1 = spill r0
164 //   ..
165 //   r2 = fill ss1
166 //
167 // Then both `r0` and `r2` will have the same value as `ss1`, provided that ".." doesn't write to
168 // `r0`.
169 //
170 // To say that two different registers may be bound to the same stack slot is the same as saying
171 // that it is allowed to have two different entries in AvailEnv with the same `(kind, offset,
172 // size)` triple.  What is *not* allowed is to have partial overlaps.  That is, if two SlotInfos
173 // have the same `kind` field and have `offset` and `size` fields that overlap, then their
174 // `offset` and `size` fields must be identical.  This is so as to make the algorithm safe against
175 // situations where, for example, a 64 bit register is spilled, but then only the bottom 32 bits
176 // are reloaded from the slot.
177 //
178 // In such a case, though, the Cranelift IR would very likely be ill-typed, so this situation
179 // probably cannot occur in practice.
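//
// To illustrate the overlap rule with hypothetical numbers: the entries
// `(SpillSlot, offset 16, size 8)` and `(SpillSlot, offset 16, size 4)` overlap without being
// identical, so they may never both be present in AvailEnv.  `bind` and `invalidate_by_offset`
// below maintain this: creating the second binding first invalidates the first.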
180 
181 #[derive(Clone)]
182 struct AvailEnv {
183     map: Vec<Option<SlotInfo>>,
184 }
185 
186 // `ProcessingStackElem` combines AvailEnv with contextual information needed to "navigate" within
187 // a Block.
188 //
189 // A ProcessingStackElem conceptually has the lifetime of exactly one Block: once the current Block is
190 // completed, the ProcessingStackElem will be abandoned.  In practice the top level state,
191 // RedundantReloadRemover, caches them, so as to avoid heap turnover.
192 //
193 // Note that ProcessingStackElem must contain a CursorPosition.  The CursorPosition, which
194 // indicates where we are in the current Block, cannot be implicitly maintained by looping over all
195 // the instructions in a Block in turn, because we may choose to suspend processing the current Block
196 // at a side exit, continue by processing the subtree reached via the side exit, and only later
197 // resume the current Block.
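//
// As a made-up illustration of why the position must be stored: suppose Block `A` branches to
// Block `C` halfway through and then falls through to Block `B`, with both `B` and `C` in `A`'s
// tree.  Processing runs through `A` up to the branch, suspends `A` there, processes all of `C`
// (starting from a clone of `A`'s AvailEnv at that point), then resumes `A` at the saved
// CursorPosition, and only reaches `B` once `A` has been completed.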
198 
199 struct ProcessingStackElem {
200     /// Indicates the AvailEnv at the current point in the Block.
201     avail_env: AvailEnv,
202 
203     /// Shows where we currently are inside the Block.
204     cursor: CursorPosition,
205 
206     /// Indicates the currently active register diversions at the current point.
207     diversions: RegDiversions,
208 }
209 
210 // =============================================================================================
211 // The top level data structure
212 
213 // `RedundantReloadRemover` contains data structures for the two passes: discovery of tree shaped
214 // regions, and processing of them.  These are allocated once and stay alive for the entire
215 // function, even though they are cleared out for each new tree shaped region.  It also caches
216 // `num_regunits` and `num_preds_per_block`, which are computed at the start of each function and
217 // then remain constant.
218 
219 /// The redundant reload remover's state.
220 pub struct RedundantReloadRemover {
221     /// The total number of RegUnits available on this architecture.  This is unknown when the
222     /// RedundantReloadRemover is created.  It becomes known at the beginning of processing of a
223     /// function.
224     num_regunits: Option<u16>,
225 
226     /// This stores, for each Block, a characterisation of the number of predecessors it has.
227     num_preds_per_block: PrimaryMap<Block, ZeroOneOrMany>,
228 
229     /// The stack used for the first phase (discovery).  There is one element on the discovery
230     /// stack for each currently unexplored Block in the tree being searched.
231     discovery_stack: Vec<Block>,
232 
233     /// The nodes in the discovered tree are inserted here.
234     nodes_in_tree: EntitySet<Block>,
235 
236     /// The stack used during the second phase (transformation).  There is one element on the
237     /// processing stack for each currently-open node in the tree being transformed.
238     processing_stack: Vec<ProcessingStackElem>,
239 
240     /// Used in the second phase to avoid visiting nodes more than once.
241     nodes_already_visited: EntitySet<Block>,
242 }
243 
244 // =============================================================================================
245 // Miscellaneous small helper functions
246 
247 // Is this a kind of stack slot that is safe to track in AvailEnv?  This is probably overly
248 // conservative, but tracking only the SpillSlot and IncomingArg kinds catches almost all
249 // available redundancy in practice.
250 fn is_slot_kind_tracked(kind: StackSlotKind) -> bool {
251     match kind {
252         StackSlotKind::SpillSlot | StackSlotKind::IncomingArg => true,
253         _ => false,
254     }
255 }
256 
257 // Find out whether the range `[offset, offset + size)` overlaps with the range in `si`.
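// For example (hypothetical values): with `si = { offset: 8, size: 8 }`, i.e. the byte range
// [8, 16), a query of `(offset: 12, size: 4)` overlaps, whereas `(offset: 16, size: 4)` does not.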
258 fn overlaps(si: &SlotInfo, offset: i32, size: u32) -> bool {
259     let a_offset = si.offset as i64;
260     let a_size = si.size as i64;
261     let b_offset = offset as i64;
262     let b_size = size as i64;
263     let no_overlap = a_offset + a_size <= b_offset || b_offset + b_size <= a_offset;
264     !no_overlap
265 }
266 
267 // Find, in `reginfo`, the register bank that `reg` lives in, and return the lower limit and size
268 // of the bank.  This is so the caller can conveniently iterate over all RegUnits in the bank that
269 // `reg` lives in.
270 fn find_bank_limits(reginfo: &RegInfo, reg: RegUnit) -> (RegUnit, u16) {
271     if let Some(bank) = reginfo.bank_containing_regunit(reg) {
272         return (bank.first_unit, bank.units);
273     }
274     // We should never get here, since `reg` must come from *some* RegBank.
275     panic!("find_regclass_limits: reg not found");
276 }
277 
278 // Returns the register that `v` is allocated to.  Assumes that `v` actually resides in a
279 // register.
280 fn reg_of_value(locations: &SecondaryMap<Value, ValueLoc>, v: Value) -> RegUnit {
281     match locations[v] {
282         ValueLoc::Reg(ru) => ru,
283         _ => panic!("reg_of_value: value isn't in a reg"),
284     }
285 }
286 
287 // Returns the stack slot that `v` is allocated to.  Assumes that `v` actually resides in a stack
288 // slot.
289 fn slot_of_value<'s>(
290     locations: &SecondaryMap<Value, ValueLoc>,
291     stack_slots: &'s StackSlots,
292     v: Value,
293 ) -> &'s StackSlotData {
294     match locations[v] {
295         ValueLoc::Stack(slot) => &stack_slots[slot],
296         _ => panic!("slot_of_value: value isn't in a stack slot"),
297     }
298 }
299 
300 // =============================================================================================
301 // Top level: discovery of tree shaped regions
302 
303 impl RedundantReloadRemover {
304     // A helper for `add_nodes_to_tree` below.
305     fn discovery_stack_push_successors_of(&mut self, cfg: &ControlFlowGraph, node: Block) {
306         for successor in cfg.succ_iter(node) {
307             self.discovery_stack.push(successor);
308         }
309     }
310 
311     // Visit the tree of Blocks rooted at `starting_point` and add them to `self.nodes_in_tree`.
312     // `self.num_preds_per_block` guides the process, ensuring we don't leave the tree-ish region
313     // and indirectly ensuring that the process will terminate in the presence of cycles in the
314     // graph.  `self.discovery_stack` holds the search state in this function.
315     fn add_nodes_to_tree(&mut self, cfg: &ControlFlowGraph, starting_point: Block) {
316         // One might well ask why this doesn't loop forever when it encounters cycles in the
317         // control flow graph.  The reason is that any cycle in the graph that is reachable from
318         // anywhere outside the cycle -- in particular, that is reachable from the function's
319         // entry node -- must have at least one node that has two or more predecessors.  So the
320         // logic below won't follow into it, because it regards any such node as the root of some
321         // other tree.
322         debug_assert!(self.discovery_stack.is_empty());
323         debug_assert!(self.nodes_in_tree.is_empty());
324 
325         self.nodes_in_tree.insert(starting_point);
326         self.discovery_stack_push_successors_of(cfg, starting_point);
327 
328         while let Some(node) = self.discovery_stack.pop() {
329             match self.num_preds_per_block[node] {
330                 // We arrived at a node with multiple predecessors, so it's a new root.  Ignore it.
331                 ZeroOneOrMany::Many => {}
332                 // This node has just one predecessor, so we should incorporate it in the tree and
333                 // immediately transition into searching from it instead.
334                 ZeroOneOrMany::One => {
335                     self.nodes_in_tree.insert(node);
336                     self.discovery_stack_push_successors_of(cfg, node);
337                 }
338                 // This is inconsistent.  We arrived at this node from a predecessor, yet the
339                 // node claims to have no predecessors at all.
340                 ZeroOneOrMany::Zero => panic!("add_nodes_to_tree: inconsistent graph"),
341             }
342         }
343     }
344 }
345 
346 // =============================================================================================
347 // Operations relating to `AvailEnv`
348 
349 impl AvailEnv {
350     // Create a new one.
351     fn new(size: usize) -> Self {
352         let mut env = Self {
353             map: Vec::<Option<SlotInfo>>::new(),
354         };
355         env.map.resize(size, None);
356         env
357     }
358 
359     // Debug only: checks (some of) the required AvailEnv invariants.
360     #[cfg(debug_assertions)]
361     fn check_invariants(&self) -> bool {
362         // Check that any overlapping entries overlap exactly.  This is super lame (quadratic),
363         // but it's only used in debug builds.
364         for i in 0..self.map.len() {
365             if let Some(si) = self.map[i] {
366                 for j in i + 1..self.map.len() {
367                     if let Some(sj) = self.map[j] {
368                         // "si and sj overlap, but not exactly"
369                         if si.kind == sj.kind
370                             && overlaps(&si, sj.offset, sj.size)
371                             && !(si.offset == sj.offset && si.size == sj.size)
372                         {
373                             return false;
374                         }
375                     }
376                 }
377             }
378         }
379         true
380     }
381 
382     // Invalidates the binding associated with `reg`.  Note that by construction of AvailEnv,
383     // `reg` can only be associated with one binding at once.
384     fn invalidate_by_reg(&mut self, reg: RegUnit) {
385         self.map[reg as usize] = None;
386     }
387 
388     // Invalidates any binding that has any overlap with `(kind, offset, size)`.
389     fn invalidate_by_offset(&mut self, kind: StackSlotKind, offset: i32, size: u32) {
390         debug_assert!(is_slot_kind_tracked(kind));
391         for i in 0..self.map.len() {
392             if let Some(si) = &self.map[i] {
393                 if si.kind == kind && overlaps(&si, offset, size) {
394                     self.map[i] = None;
395                 }
396             }
397         }
398     }
399 
400     // Invalidates all bindings.
401     fn invalidate_all(&mut self) {
402         for i in 0..self.map.len() {
403             self.map[i] = None;
404         }
405     }
406 
407     // Updates AvailEnv to track the effect of a `regmove` instruction.
408     fn copy_reg(&mut self, src: RegUnit, dst: RegUnit) {
409         self.map[dst as usize] = self.map[src as usize];
410     }
411 
412     // Does `env` have the exact binding characterised by `(reg, kind, offset, size)` ?
413     fn has_exact_binding(&self, reg: RegUnit, kind: StackSlotKind, offset: i32, size: u32) -> bool {
414         debug_assert!(is_slot_kind_tracked(kind));
415         if let Some(si) = &self.map[reg as usize] {
416             return si.kind == kind && si.offset == offset && si.size == size;
417         }
418         // No such binding.
419         false
420     }
421 
422     // Does `env` have a binding characterised by `(kind, offset, size)` but to a register, let's
423     // call it `other_reg`, that isn't `reg`?  If so, return `other_reg`.  Note that `other_reg`
424     // will have the same bank as `reg`.  It is a checked error to call this function with a
425     // binding matching all four of `(reg, kind, offset, size)`.
426     fn has_inexact_binding(
427         &self,
428         reginfo: &RegInfo,
429         reg: RegUnit,
430         kind: StackSlotKind,
431         offset: i32,
432         size: u32,
433     ) -> Option<RegUnit> {
434         debug_assert!(is_slot_kind_tracked(kind));
435         // Find the range of RegUnit numbers for the bank that contains `reg`, and use that as our
436         // search space.  This is so as to guarantee that any match is restricted to the same bank
437         // as `reg`.
438         let (first_unit, num_units) = find_bank_limits(reginfo, reg);
439         for other_reg in first_unit..first_unit + num_units {
440             if let Some(si) = &self.map[other_reg as usize] {
441                 if si.kind == kind && si.offset == offset && si.size == size {
442                     if other_reg == reg {
443                         panic!("has_inexact_binding: binding *is* exact!");
444                     }
445                     return Some(other_reg);
446                 }
447             }
448         }
449         // No such binding.
450         None
451     }
452 
453     // Create the binding `(reg, kind, offset, size)` in `env`, and throw away any previous
454     // binding associated with either `reg` or the `(kind, offset, size)` triple.
455     fn bind(&mut self, reg: RegUnit, kind: StackSlotKind, offset: i32, size: u32) {
456         debug_assert!(is_slot_kind_tracked(kind));
457         self.invalidate_by_offset(kind, offset, size);
458         self.map[reg as usize] = Some(SlotInfo { kind, offset, size });
459     }
460 }
461 
462 // Invalidates, in `avail_env`, any binding associated with a regunit that is written by `inst`.
463 fn invalidate_regs_written_by_inst(
464     locations: &SecondaryMap<Value, ValueLoc>,
465     diversions: &RegDiversions,
466     dfg: &DataFlowGraph,
467     avail_env: &mut AvailEnv,
468     inst: Inst,
469 ) {
470     for v in dfg.inst_results(inst).iter() {
471         if let ValueLoc::Reg(ru) = locations[*v] {
472             // This must be true.  It would be meaningless for an SSA value to be diverted before
473             // the point where it is defined.
474             debug_assert!(diversions.reg(*v, locations) == ru);
475             avail_env.invalidate_by_reg(ru);
476         }
477     }
478 }
479 
480 // =============================================================================================
481 // Processing of individual instructions
482 
483 impl RedundantReloadRemover {
484     // Process `inst`, possibly changing it into a different instruction, and possibly changing
485     // the top-of-stack AvailEnv in `self.processing_stack`, and `func.dfg`.
486     fn visit_inst(
487         &mut self,
488         func: &mut Function,
489         reginfo: &RegInfo,
490         isa: &dyn TargetIsa,
491         inst: Inst,
492     ) {
493         // Get hold of the top-of-stack work item.  This is the state that we will mutate during
494         // processing of this instruction.
495         debug_assert!(!self.processing_stack.is_empty());
496         let ProcessingStackElem {
497             avail_env,
498             diversions,
499             ..
500         } = self.processing_stack.last_mut().unwrap();
501 
502         #[cfg(debug_assertions)]
503         debug_assert!(
504             avail_env.check_invariants(),
505             "visit_inst: env invariants not ok"
506         );
507 
508         let dfg = &mut func.dfg;
509         let locations = &func.locations;
510         let stack_slots = &func.stack_slots;
511 
512         // To avoid difficulties with the borrow checker, do this in two stages.  First, examine
513         // the instruction to see if it can be deleted or modified, and park the relevant
514         // information in `transform`.  Update `self.avail_env` too.  Later, use `transform` to
515         // actually do the transformation if necessary.
516         enum Transform {
517             NoChange,
518             ChangeToNopFill(Value),           // delete this insn entirely
519             ChangeToCopyToSSA(Type, RegUnit), // change it into a copy from the specified reg
520         }
521         let mut transform = Transform::NoChange;
522 
523         // In this match { .. } statement, either we must treat the instruction specially, or we
524         // must call `invalidate_regs_written_by_inst` on it.
525         match &dfg[inst] {
526             InstructionData::Unary {
527                 opcode: Opcode::Spill,
528                 arg: src_value,
529             } => {
530                 // Extract: (src_reg, kind, offset, size)
531                 // Invalidate: (kind, offset, size)
532                 // Add new binding: {src_reg -> (kind, offset, size)}
533                 // Don't forget that src_value might be diverted, so we have to deref it.
534                 let slot = slot_of_value(locations, stack_slots, dfg.inst_results(inst)[0]);
535                 let src_reg = diversions.reg(*src_value, locations);
536                 let kind = slot.kind;
537                 if is_slot_kind_tracked(kind) {
538                     let offset = slot.offset.expect("visit_inst: spill with no offset");
539                     let size = slot.size;
540                     avail_env.bind(src_reg, kind, offset, size);
541                 } else {
542                     // We don't expect this insn to write any regs.  But to be consistent with the
543                     // rule above, do this anyway.
544                     invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
545                 }
546             }
547             InstructionData::Unary {
548                 opcode: Opcode::Fill,
549                 arg: src_value,
550             } => {
551                 // Extract: (dst_reg, kind, offset, size)
552                 // Invalidate: (kind, offset, size)
553                 // Add new binding: {dst_reg -> (kind, offset, size)}
554                 let slot = slot_of_value(locations, stack_slots, *src_value);
555                 let dst_value = dfg.inst_results(inst)[0];
556                 let dst_reg = reg_of_value(locations, dst_value);
557                 // This must be true.  It would be meaningless for an SSA value to be diverted
558                 // before it was defined.
559                 debug_assert!(dst_reg == diversions.reg(dst_value, locations));
560                 let kind = slot.kind;
561                 if is_slot_kind_tracked(kind) {
562                     let offset = slot.offset.expect("visit_inst: fill with no offset");
563                     let size = slot.size;
564                     if avail_env.has_exact_binding(dst_reg, kind, offset, size) {
565                         // This instruction is an exact copy of a fill we saw earlier, and the
566                         // loaded value is still valid.  So we'll schedule this instruction for
567                         // deletion (below).  No need to make any changes to `avail_env`.
568                         transform = Transform::ChangeToNopFill(*src_value);
569                     } else if let Some(other_reg) =
570                         avail_env.has_inexact_binding(reginfo, dst_reg, kind, offset, size)
571                     {
572                         // This fill is from the required slot, but into a different register
573                         // `other_reg`.  So replace it with a copy from `other_reg` to `dst_reg`
574                         // and update `dst_reg`'s binding to make it the same as `other_reg`'s, so
575                         // as to maximise the chances of future matches after this instruction.
576                         debug_assert!(other_reg != dst_reg);
577                         transform =
578                             Transform::ChangeToCopyToSSA(dfg.value_type(dst_value), other_reg);
579                         avail_env.copy_reg(other_reg, dst_reg);
580                     } else {
581                         // This fill creates some new binding we don't know about.  Update
582                         // `avail_env` to track it.
583                         avail_env.bind(dst_reg, kind, offset, size);
584                     }
585                 } else {
586                     // Else it's "just another instruction that writes a reg", so we'd better
587                     // treat it as such, just as we do below for instructions that we don't handle
588                     // specially.
589                     invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
590                 }
591             }
592             InstructionData::RegMove { src, dst, .. } => {
593                 // These happen relatively rarely, but just frequently enough that it's worth
594                 // tracking the copy (at the machine level, it's really a copy) in `avail_env`.
595                 avail_env.copy_reg(*src, *dst);
596             }
597             InstructionData::RegSpill { .. }
598             | InstructionData::RegFill { .. }
599             | InstructionData::Call { .. }
600             | InstructionData::CallIndirect { .. }
601             | InstructionData::StackLoad { .. }
602             | InstructionData::StackStore { .. }
603             | InstructionData::Unary {
604                 opcode: Opcode::AdjustSpDown,
605                 ..
606             }
607             | InstructionData::UnaryImm {
608                 opcode: Opcode::AdjustSpUpImm,
609                 ..
610             }
611             | InstructionData::UnaryImm {
612                 opcode: Opcode::AdjustSpDownImm,
613                 ..
614             } => {
615                 // All of these change, or might change, the memory-register bindings tracked in
616                 // `avail_env` in ways we don't know about -- or at least ways we could track, but
617                 // for which the effort-to-benefit ratio seems too low to bother.  So play safe:
618                 // forget everything we know.
619                 //
620                 // For Call/CallIndirect, we could do better when compiling for calling
621                 // conventions that have callee-saved registers, since bindings for them would
622                 // remain valid across the call.
623                 avail_env.invalidate_all();
624             }
625             _ => {
626                 // Invalidate: any `avail_env` entry associated with a reg written by `inst`.
627                 invalidate_regs_written_by_inst(locations, diversions, dfg, avail_env, inst);
628             }
629         }
630 
631         // Actually do the transformation.
632         match transform {
633             Transform::NoChange => {}
634             Transform::ChangeToNopFill(arg) => {
635                 // Load is completely redundant.  Convert it to a no-op.
636                 dfg.replace(inst).fill_nop(arg);
637                 let ok = func.update_encoding(inst, isa).is_ok();
638                 debug_assert!(ok, "fill_nop encoding missing for this type");
639             }
640             Transform::ChangeToCopyToSSA(ty, reg) => {
641                 // We already have the relevant value in some other register.  Convert the
642                 // load into a reg-reg copy.
643                 dfg.replace(inst).copy_to_ssa(ty, reg);
644                 let ok = func.update_encoding(inst, isa).is_ok();
645                 debug_assert!(ok, "copy_to_ssa encoding missing for type {}", ty);
646             }
647         }
648     }
649 }
650 
651 // =============================================================================================
652 // Top level: processing of tree shaped regions
653 
654 impl RedundantReloadRemover {
655     // Push a new ProcessingStackElem for processing exactly one Block.  Its AvailEnv is cloned
656     // from the current top of stack, or is empty if the stack is empty.  The diversions are
657     // created new, rather than cloned, because diversions are local to each Block.
658     fn processing_stack_push(&mut self, cursor: CursorPosition) {
659         let avail_env = if let Some(stack_top) = self.processing_stack.last() {
660             stack_top.avail_env.clone()
661         } else {
662             AvailEnv::new(
663                 self.num_regunits
664                     .expect("processing_stack_push: num_regunits unknown!")
665                     as usize,
666             )
667         };
668         self.processing_stack.push(ProcessingStackElem {
669             avail_env,
670             cursor,
671             diversions: RegDiversions::new(),
672         });
673     }
674 
675     // This pushes the node `dst` onto the processing stack, and sets up the new
676     // ProcessingStackElem accordingly.  But it does all that only if `dst` is part of the current
677     // tree *and* we haven't yet visited it.
678     fn processing_stack_maybe_push(&mut self, dst: Block) {
679         if self.nodes_in_tree.contains(dst) && !self.nodes_already_visited.contains(dst) {
680             if !self.processing_stack.is_empty() {
681                 // If this isn't the outermost node in the tree (that is, the root), then it must
682                 // have exactly one predecessor.  Nodes with no predecessors are dead and not
683                 // incorporated in any tree.  Nodes with two or more predecessors are the root of
684                 // some other tree, and visiting them as if they were part of the current tree
685                 // would be a serious error.
686                 debug_assert!(self.num_preds_per_block[dst] == ZeroOneOrMany::One);
687             }
688             self.processing_stack_push(CursorPosition::Before(dst));
689             self.nodes_already_visited.insert(dst);
690         }
691     }
692 
693     // Perform redundant-reload removal on the tree-shaped graph region defined by `root` and
694     // `self.nodes_in_tree`.  The following state is modified: `self.processing_stack`,
695     // `self.nodes_already_visited`, and `func.dfg`.
696     fn process_tree(
697         &mut self,
698         func: &mut Function,
699         reginfo: &RegInfo,
700         isa: &dyn TargetIsa,
701         root: Block,
702     ) {
703         debug_assert!(self.nodes_in_tree.contains(root));
704         debug_assert!(self.processing_stack.is_empty());
705         debug_assert!(self.nodes_already_visited.is_empty());
706 
707         // Create the initial work item
708         self.processing_stack_maybe_push(root);
709 
710         while !self.processing_stack.is_empty() {
711             // It seems somewhat ridiculous to construct a whole new FuncCursor just so we can do
712             // next_inst() on it once, and then copy the resulting position back out.  But use of
713             // a function-global FuncCursor, or of the EncCursor in struct Context, leads to
714             // borrow checker problems, as does including FuncCursor directly in
715             // ProcessingStackElem.  In any case this is not as bad as it looks, since profiling
716             // shows that the build-insert-step-extract work is reduced to just 8 machine
717             // instructions in an optimised x86_64 build, presumably because rustc can inline and
718             // then optimise out almost all the work.
719             let tos = self.processing_stack.len() - 1;
720             let mut pos = FuncCursor::new(func).at_position(self.processing_stack[tos].cursor);
721             let maybe_inst = pos.next_inst();
722             self.processing_stack[tos].cursor = pos.position();
723 
724             if let Some(inst) = maybe_inst {
725                 // Deal with this insn, possibly changing it, possibly updating the top item of
726                 // `self.processing_stack`.
727                 self.visit_inst(func, reginfo, isa, inst);
728 
729                 // Update diversions after the insn.
730                 self.processing_stack[tos].diversions.apply(&func.dfg[inst]);
731 
732                 // If the insn can branch outside this Block, push work items on the stack for all
733                 // target Blocks that are part of the same tree and that we haven't yet visited.
734                 // The next iteration of this instruction-processing loop will immediately start
735                 // work on the most recently pushed Block, and will eventually continue in this Block
736                 // when those new items have been removed from the stack.
737                 match func.dfg.analyze_branch(inst) {
738                     BranchInfo::NotABranch => (),
739                     BranchInfo::SingleDest(dst, _) => {
740                         self.processing_stack_maybe_push(dst);
741                     }
742                     BranchInfo::Table(jt, default) => {
743                         func.jump_tables[jt]
744                             .iter()
745                             .for_each(|dst| self.processing_stack_maybe_push(*dst));
746                         if let Some(dst) = default {
747                             self.processing_stack_maybe_push(dst);
748                         }
749                     }
750                 }
751             } else {
752                 // We've come to the end of the current work-item (Block).  We'll already have
753                 // processed the fallthrough/continuation/whatever for it using the logic above.
754                 // Pop it off the stack and resume work on its parent.
755                 self.processing_stack.pop();
756             }
757         }
758     }
759 }
760 
761 // =============================================================================================
762 // Top level: perform redundant fill removal for a complete function
763 
764 impl RedundantReloadRemover {
765     /// Create a new remover state.
766     pub fn new() -> Self {
767         Self {
768             num_regunits: None,
769             num_preds_per_block: PrimaryMap::<Block, ZeroOneOrMany>::with_capacity(8),
770             discovery_stack: Vec::<Block>::with_capacity(16),
771             nodes_in_tree: EntitySet::<Block>::new(),
772             processing_stack: Vec::<ProcessingStackElem>::with_capacity(8),
773             nodes_already_visited: EntitySet::<Block>::new(),
774         }
775     }
776 
777     /// Clear the state of the remover.
778     pub fn clear(&mut self) {
779         self.clear_for_new_function();
780     }
781 
782     fn clear_for_new_function(&mut self) {
783         self.num_preds_per_block.clear();
784         self.clear_for_new_tree();
785     }
786 
787     fn clear_for_new_tree(&mut self) {
788         self.discovery_stack.clear();
789         self.nodes_in_tree.clear();
790         self.processing_stack.clear();
791         self.nodes_already_visited.clear();
792     }
793 
794     #[inline(never)]
795     fn do_redundant_fill_removal_on_function(
796         &mut self,
797         func: &mut Function,
798         reginfo: &RegInfo,
799         isa: &dyn TargetIsa,
800         cfg: &ControlFlowGraph,
801     ) {
802         // Fail in an obvious way if there are more than (2^32)-1 Blocks in this function.
803         let num_blocks: u32 = func.dfg.num_blocks().try_into().unwrap();
804 
805         // Clear out per-function state.
806         self.clear_for_new_function();
807 
808         // Create a PrimaryMap that summarises the number of predecessors for each block, as 0, 1
809         // or "many", and that also claims the entry block as having "many" predecessors.
810         self.num_preds_per_block.clear();
811         self.num_preds_per_block.reserve(num_blocks as usize);
812 
813         for i in 0..num_blocks {
814             let mut pi = cfg.pred_iter(Block::from_u32(i));
815             let mut n_pi = ZeroOneOrMany::Zero;
816             if pi.next().is_some() {
817                 n_pi = ZeroOneOrMany::One;
818                 if pi.next().is_some() {
819                     n_pi = ZeroOneOrMany::Many;
820                     // We don't care if there are more than two preds, so stop counting now.
821                 }
822             }
823             self.num_preds_per_block.push(n_pi);
824         }
825         debug_assert!(self.num_preds_per_block.len() == num_blocks as usize);
826 
827         // The entry block must be the root of some tree, so set up the state to reflect that.
828         let entry_block = func
829             .layout
830             .entry_block()
831             .expect("do_redundant_fill_removal_on_function: entry block unknown");
832         debug_assert!(self.num_preds_per_block[entry_block] == ZeroOneOrMany::Zero);
833         self.num_preds_per_block[entry_block] = ZeroOneOrMany::Many;
834 
835         // Now build and process trees.
836         for root_ix in 0..self.num_preds_per_block.len() {
837             let root = Block::from_u32(root_ix as u32);
838 
839             // Build a tree for each node that has two or more preds, and ignore all other nodes.
840             if self.num_preds_per_block[root] != ZeroOneOrMany::Many {
841                 continue;
842             }
843 
844             // Clear out per-tree state.
845             self.clear_for_new_tree();
846 
847             // Discovery phase: build the tree, as `root` and `self.nodes_in_tree`.
848             self.add_nodes_to_tree(cfg, root);
849             debug_assert!(self.nodes_in_tree.cardinality() > 0);
850             debug_assert!(self.num_preds_per_block[root] == ZeroOneOrMany::Many);
851 
852             // Processing phase: do redundant-reload-removal.
853             self.process_tree(func, reginfo, isa, root);
854             debug_assert!(
855                 self.nodes_in_tree.cardinality() == self.nodes_already_visited.cardinality()
856             );
857         }
858     }
859 }
860 
861 // =============================================================================================
862 // Top level: the external interface
863 
864 struct Context<'a> {
865     // Current instruction as well as reference to function and ISA.
866     cur: EncCursor<'a>,
867 
868     // Cached ISA information.  We save it here to avoid frequent virtual function calls on the
869     // `TargetIsa` trait object.
870     reginfo: RegInfo,
871 
872     // References to contextual data structures we need.
873     cfg: &'a ControlFlowGraph,
874 
875     // The running state.
876     state: &'a mut RedundantReloadRemover,
877 }
878 
879 impl RedundantReloadRemover {
880     /// Run the remover.
881     pub fn run(&mut self, isa: &dyn TargetIsa, func: &mut Function, cfg: &ControlFlowGraph) {
882         let ctx = Context {
883             cur: EncCursor::new(func, isa),
884             reginfo: isa.register_info(),
885             cfg,
886             state: self,
887         };
888         let mut total_regunits = 0;
889         for rb in isa.register_info().banks {
890             total_regunits += rb.units;
891         }
892         ctx.state.num_regunits = Some(total_regunits);
893         ctx.state.do_redundant_fill_removal_on_function(
894             ctx.cur.func,
895             &ctx.reginfo,
896             ctx.cur.isa,
897             &ctx.cfg,
898         );
899     }
900 }
901
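// A minimal usage sketch (the surrounding `isa`, `func` and `cfg` values are assumed to exist;
// `new`, `run` and `clear` are the public API defined above):
//
//   let mut remover = RedundantReloadRemover::new();
//   remover.run(isa, &mut func, &cfg);            // run after register allocation
//   remover.clear();                              // optional: reset before reuse
//   remover.run(isa, &mut next_func, &next_cfg);  // the same allocation serves the next function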