1 // Copyright (c) 2018 Google LLC.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "source/opt/loop_fission.h"
16 
17 #include <set>
18 
19 #include "source/opt/register_pressure.h"
20 
21 // Implement loop fission with an optional parameter to split only
22 // if the register pressure in a given loop meets a certain criteria. This is
23 // controlled via the constructors of LoopFissionPass.
24 //
25 // 1 - Build a list of loops to be split, these are top level loops (loops
26 // without child loops themselves) which meet the register pressure criteria, as
27 // determined by the ShouldSplitLoop method of LoopFissionPass.
28 //
29 // 2 - For each loop in the list, group each instruction into a set of related
30 // instructions by traversing each instructions users and operands recursively.
31 // We stop if we encounter an instruction we have seen before or an instruction
32 // which we don't consider relevent (i.e OpLoopMerge). We then group these
33 // groups into two different sets, one for the first loop and one for the
34 // second.
35 //
36 // 3 - We then run CanPerformSplit to check that it would be legal to split a
37 // loop using those two sets. We check that we haven't altered the relative
38 // order load/stores appear in the binary and that we aren't breaking any
39 // dependency between load/stores by splitting them into two loops. We also
40 // check that none of the OpBranch instructions are dependent on a load as we
41 // leave control flow structure intact and move only instructions in the body so
42 // we want to avoid any loads with side affects or aliasing.
43 //
44 // 4 - We then split the loop by calling SplitLoop. This function clones the
45 // loop and attaches it to the preheader and connects the new loops merge block
46 // to the current loop header block. We then use the two sets built in step 2 to
47 // remove instructions from each loop. If an instruction appears in the first
48 // set it is removed from the second loop and vice versa.
49 //
50 // 5 - If the multiple split passes flag is set we check if each of the loops
51 // still meet the register pressure criteria. If they do then we add them to the
52 // list of loops to be split (created in step one) to allow for loops to be
53 // split multiple times.
54 //
55 
56 namespace spvtools {
57 namespace opt {
58 
59 class LoopFissionImpl {
60  public:
LoopFissionImpl(IRContext * context,Loop * loop)61   LoopFissionImpl(IRContext* context, Loop* loop)
62       : context_(context), loop_(loop), load_used_in_condition_(false) {}
63 
64   // Group each instruction in the loop into sets of instructions related by
65   // their usedef chains. An instruction which uses another will appear in the
66   // same set. Then merge those sets into just two sets. Returns false if there
67   // was one or less sets created.
68   bool GroupInstructionsByUseDef();
69 
70   // Check if the sets built by GroupInstructionsByUseDef violate any data
71   // dependence rules.
72   bool CanPerformSplit();
73 
74   // Split the loop and return a pointer to the new loop.
75   Loop* SplitLoop();
76 
77   // Checks if |inst| is safe to move. We can only move instructions which don't
78   // have any side effects and OpLoads and OpStores.
79   bool MovableInstruction(const Instruction& inst) const;
80 
81  private:
82   // Traverse the def use chain of |inst| and add the users and uses of |inst|
83   // which are in the same loop to the |returned_set|.
84   void TraverseUseDef(Instruction* inst, std::set<Instruction*>* returned_set,
85                       bool ignore_phi_users = false, bool report_loads = false);
86 
87   // We group the instructions in the block into two different groups, the
88   // instructions to be kept in the original loop and the ones to be cloned into
89   // the new loop. As the cloned loop is attached to the preheader it will be
90   // the first loop and the second loop will be the original.
91   std::set<Instruction*> cloned_loop_instructions_;
92   std::set<Instruction*> original_loop_instructions_;
93 
94   // We need a set of all the instructions to be seen so we can break any
95   // recursion and also so we can ignore certain instructions by preemptively
96   // adding them to this set.
97   std::set<Instruction*> seen_instructions_;
98 
99   // A map of instructions to their relative position in the function.
100   std::map<Instruction*, size_t> instruction_order_;
101 
102   IRContext* context_;
103 
104   Loop* loop_;
105 
106   // This is set to true by TraverseUseDef when traversing the instructions
107   // related to the loop condition and any if conditions should any of those
108   // instructions be a load.
109   bool load_used_in_condition_;
110 };
111 
MovableInstruction(const Instruction & inst) const112 bool LoopFissionImpl::MovableInstruction(const Instruction& inst) const {
113   return inst.opcode() == SpvOp::SpvOpLoad ||
114          inst.opcode() == SpvOp::SpvOpStore ||
115          inst.opcode() == SpvOp::SpvOpSelectionMerge ||
116          inst.opcode() == SpvOp::SpvOpPhi || inst.IsOpcodeCodeMotionSafe();
117 }
118 
TraverseUseDef(Instruction * inst,std::set<Instruction * > * returned_set,bool ignore_phi_users,bool report_loads)119 void LoopFissionImpl::TraverseUseDef(Instruction* inst,
120                                      std::set<Instruction*>* returned_set,
121                                      bool ignore_phi_users, bool report_loads) {
122   assert(returned_set && "Set to be returned cannot be null.");
123 
124   analysis::DefUseManager* def_use = context_->get_def_use_mgr();
125   std::set<Instruction*>& inst_set = *returned_set;
126 
127   // We create this functor to traverse the use def chain to build the
128   // grouping of related instructions. The lambda captures the std::function
129   // to allow it to recurse.
130   std::function<void(Instruction*)> traverser_functor;
131   traverser_functor = [this, def_use, &inst_set, &traverser_functor,
132                        ignore_phi_users, report_loads](Instruction* user) {
133     // If we've seen the instruction before or it is not inside the loop end the
134     // traversal.
135     if (!user || seen_instructions_.count(user) != 0 ||
136         !context_->get_instr_block(user) ||
137         !loop_->IsInsideLoop(context_->get_instr_block(user))) {
138       return;
139     }
140 
141     // Don't include labels or loop merge instructions in the instruction sets.
142     // Including them would mean we group instructions related only by using the
143     // same labels (i.e phis). We already preempt the inclusion of
144     // OpSelectionMerge by adding related instructions to the seen_instructions_
145     // set.
146     if (user->opcode() == SpvOp::SpvOpLoopMerge ||
147         user->opcode() == SpvOp::SpvOpLabel)
148       return;
149 
150     // If the |report_loads| flag is set, set the class field
151     // load_used_in_condition_ to false. This is used to check that none of the
152     // condition checks in the loop rely on loads.
153     if (user->opcode() == SpvOp::SpvOpLoad && report_loads) {
154       load_used_in_condition_ = true;
155     }
156 
157     // Add the instruction to the set of instructions already seen, this breaks
158     // recursion and allows us to ignore certain instructions.
159     seen_instructions_.insert(user);
160 
161     inst_set.insert(user);
162 
163     // Wrapper functor to traverse the operands of each instruction.
164     auto traverse_operand = [&traverser_functor, def_use](const uint32_t* id) {
165       traverser_functor(def_use->GetDef(*id));
166     };
167     user->ForEachInOperand(traverse_operand);
168 
169     // For the first traversal we want to ignore the users of the phi.
170     if (ignore_phi_users && user->opcode() == SpvOp::SpvOpPhi) return;
171 
172     // Traverse each user with this lambda.
173     def_use->ForEachUser(user, traverser_functor);
174 
175     // Wrapper functor for the use traversal.
176     auto traverse_use = [&traverser_functor](Instruction* use, uint32_t) {
177       traverser_functor(use);
178     };
179     def_use->ForEachUse(user, traverse_use);
180 
181   };
182 
183   // We start the traversal of the use def graph by invoking the above
184   // lambda with the |inst| parameter.
185   traverser_functor(inst);
186 }
187 
GroupInstructionsByUseDef()188 bool LoopFissionImpl::GroupInstructionsByUseDef() {
189   std::vector<std::set<Instruction*>> sets{};
190 
191   // We want to ignore all the instructions stemming from the loop condition
192   // instruction.
193   BasicBlock* condition_block = loop_->FindConditionBlock();
194 
195   if (!condition_block) return false;
196   Instruction* condition = &*condition_block->tail();
197 
198   // We iterate over the blocks via iterating over all the blocks in the
199   // function, we do this so we are iterating in the same order which the blocks
200   // appear in the binary.
201   Function& function = *loop_->GetHeaderBlock()->GetParent();
202 
203   // Create a temporary set to ignore certain groups of instructions within the
204   // loop. We don't want any instructions related to control flow to be removed
205   // from either loop only instructions within the control flow bodies.
206   std::set<Instruction*> instructions_to_ignore{};
207   TraverseUseDef(condition, &instructions_to_ignore, true, true);
208 
209   // Traverse control flow instructions to ensure they are added to the
210   // seen_instructions_ set and will be ignored when it it called with actual
211   // sets.
212   for (BasicBlock& block : function) {
213     if (!loop_->IsInsideLoop(block.id())) continue;
214 
215     for (Instruction& inst : block) {
216       // Ignore all instructions related to control flow.
217       if (inst.opcode() == SpvOp::SpvOpSelectionMerge || inst.IsBranch()) {
218         TraverseUseDef(&inst, &instructions_to_ignore, true, true);
219       }
220     }
221   }
222 
223   // Traverse the instructions and generate the sets, automatically ignoring any
224   // instructions in instructions_to_ignore.
225   for (BasicBlock& block : function) {
226     if (!loop_->IsInsideLoop(block.id()) ||
227         loop_->GetHeaderBlock()->id() == block.id())
228       continue;
229 
230     for (Instruction& inst : block) {
231       // Record the order that each load/store is seen.
232       if (inst.opcode() == SpvOp::SpvOpLoad ||
233           inst.opcode() == SpvOp::SpvOpStore) {
234         instruction_order_[&inst] = instruction_order_.size();
235       }
236 
237       // Ignore instructions already seen in a traversal.
238       if (seen_instructions_.count(&inst) != 0) {
239         continue;
240       }
241 
242       // Build the set.
243       std::set<Instruction*> inst_set{};
244       TraverseUseDef(&inst, &inst_set);
245       if (!inst_set.empty()) sets.push_back(std::move(inst_set));
246     }
247   }
248 
249   // If we have one or zero sets return false to indicate that due to
250   // insufficient instructions we couldn't split the loop into two groups and
251   // thus the loop can't be split any further.
252   if (sets.size() < 2) {
253     return false;
254   }
255 
256   // Merge the loop sets into two different sets. In CanPerformSplit we will
257   // validate that we don't break the relative ordering of loads/stores by doing
258   // this.
259   for (size_t index = 0; index < sets.size() / 2; ++index) {
260     cloned_loop_instructions_.insert(sets[index].begin(), sets[index].end());
261   }
262   for (size_t index = sets.size() / 2; index < sets.size(); ++index) {
263     original_loop_instructions_.insert(sets[index].begin(), sets[index].end());
264   }
265 
266   return true;
267 }
268 
CanPerformSplit()269 bool LoopFissionImpl::CanPerformSplit() {
270   // Return false if any of the condition instructions in the loop depend on a
271   // load.
272   if (load_used_in_condition_) {
273     return false;
274   }
275 
276   // Build a list of all parent loops of this loop. Loop dependence analysis
277   // needs this structure.
278   std::vector<const Loop*> loops;
279   Loop* parent_loop = loop_;
280   while (parent_loop) {
281     loops.push_back(parent_loop);
282     parent_loop = parent_loop->GetParent();
283   }
284 
285   LoopDependenceAnalysis analysis{context_, loops};
286 
287   // A list of all the stores in the cloned loop.
288   std::vector<Instruction*> set_one_stores{};
289 
290   // A list of all the loads in the cloned loop.
291   std::vector<Instruction*> set_one_loads{};
292 
293   // Populate the above lists.
294   for (Instruction* inst : cloned_loop_instructions_) {
295     if (inst->opcode() == SpvOp::SpvOpStore) {
296       set_one_stores.push_back(inst);
297     } else if (inst->opcode() == SpvOp::SpvOpLoad) {
298       set_one_loads.push_back(inst);
299     }
300 
301     // If we find any instruction which we can't move (such as a barrier),
302     // return false.
303     if (!MovableInstruction(*inst)) return false;
304   }
305 
306   // We need to calculate the depth of the loop to create the loop dependency
307   // distance vectors.
308   const size_t loop_depth = loop_->GetDepth();
309 
310   // Check the dependencies between loads in the cloned loop and stores in the
311   // original and vice versa.
312   for (Instruction* inst : original_loop_instructions_) {
313     // If we find any instruction which we can't move (such as a barrier),
314     // return false.
315     if (!MovableInstruction(*inst)) return false;
316 
317     // Look at the dependency between the loads in the original and stores in
318     // the cloned loops.
319     if (inst->opcode() == SpvOp::SpvOpLoad) {
320       for (Instruction* store : set_one_stores) {
321         DistanceVector vec{loop_depth};
322 
323         // If the store actually should appear after the load, return false.
324         // This means the store has been placed in the wrong grouping.
325         if (instruction_order_[store] > instruction_order_[inst]) {
326           return false;
327         }
328         // If not independent check the distance vector.
329         if (!analysis.GetDependence(store, inst, &vec)) {
330           for (DistanceEntry& entry : vec.GetEntries()) {
331             // A distance greater than zero means that the store in the cloned
332             // loop has a dependency on the load in the original loop.
333             if (entry.distance > 0) return false;
334           }
335         }
336       }
337     } else if (inst->opcode() == SpvOp::SpvOpStore) {
338       for (Instruction* load : set_one_loads) {
339         DistanceVector vec{loop_depth};
340 
341         // If the load actually should appear after the store, return false.
342         if (instruction_order_[load] > instruction_order_[inst]) {
343           return false;
344         }
345 
346         // If not independent check the distance vector.
347         if (!analysis.GetDependence(inst, load, &vec)) {
348           for (DistanceEntry& entry : vec.GetEntries()) {
349             // A distance less than zero means the load in the cloned loop is
350             // dependent on the store instruction in the original loop.
351             if (entry.distance < 0) return false;
352           }
353         }
354       }
355     }
356   }
357   return true;
358 }
359 
SplitLoop()360 Loop* LoopFissionImpl::SplitLoop() {
361   // Clone the loop.
362   LoopUtils util{context_, loop_};
363   LoopUtils::LoopCloningResult clone_results;
364   Loop* cloned_loop = util.CloneAndAttachLoopToHeader(&clone_results);
365 
366   // Update the OpLoopMerge in the cloned loop.
367   cloned_loop->UpdateLoopMergeInst();
368 
369   // Add the loop_ to the module.
370   // TODO(1841): Handle failure to create pre-header.
371   Function::iterator it =
372       util.GetFunction()->FindBlock(loop_->GetOrCreatePreHeaderBlock()->id());
373   util.GetFunction()->AddBasicBlocks(clone_results.cloned_bb_.begin(),
374                                      clone_results.cloned_bb_.end(), ++it);
375   loop_->SetPreHeaderBlock(cloned_loop->GetMergeBlock());
376 
377   std::vector<Instruction*> instructions_to_kill{};
378 
379   // Kill all the instructions which should appear in the cloned loop but not in
380   // the original loop.
381   for (uint32_t id : loop_->GetBlocks()) {
382     BasicBlock* block = context_->cfg()->block(id);
383 
384     for (Instruction& inst : *block) {
385       // If the instruction appears in the cloned loop instruction group, kill
386       // it.
387       if (cloned_loop_instructions_.count(&inst) == 1 &&
388           original_loop_instructions_.count(&inst) == 0) {
389         instructions_to_kill.push_back(&inst);
390         if (inst.opcode() == SpvOp::SpvOpPhi) {
391           context_->ReplaceAllUsesWith(
392               inst.result_id(), clone_results.value_map_[inst.result_id()]);
393         }
394       }
395     }
396   }
397 
398   // Kill all instructions which should appear in the original loop and not in
399   // the cloned loop.
400   for (uint32_t id : cloned_loop->GetBlocks()) {
401     BasicBlock* block = context_->cfg()->block(id);
402     for (Instruction& inst : *block) {
403       Instruction* old_inst = clone_results.ptr_map_[&inst];
404       // If the instruction belongs to the original loop instruction group, kill
405       // it.
406       if (cloned_loop_instructions_.count(old_inst) == 0 &&
407           original_loop_instructions_.count(old_inst) == 1) {
408         instructions_to_kill.push_back(&inst);
409       }
410     }
411   }
412 
413   for (Instruction* i : instructions_to_kill) {
414     context_->KillInst(i);
415   }
416 
417   return cloned_loop;
418 }
419 
LoopFissionPass(const size_t register_threshold_to_split,bool split_multiple_times)420 LoopFissionPass::LoopFissionPass(const size_t register_threshold_to_split,
421                                  bool split_multiple_times)
422     : split_multiple_times_(split_multiple_times) {
423   // Split if the number of registers in the loop exceeds
424   // |register_threshold_to_split|.
425   split_criteria_ =
426       [register_threshold_to_split](
427           const RegisterLiveness::RegionRegisterLiveness& liveness) {
428         return liveness.used_registers_ > register_threshold_to_split;
429       };
430 }
431 
LoopFissionPass()432 LoopFissionPass::LoopFissionPass() : split_multiple_times_(false) {
433   // Split by default.
434   split_criteria_ = [](const RegisterLiveness::RegionRegisterLiveness&) {
435     return true;
436   };
437 }
438 
ShouldSplitLoop(const Loop & loop,IRContext * c)439 bool LoopFissionPass::ShouldSplitLoop(const Loop& loop, IRContext* c) {
440   LivenessAnalysis* analysis = c->GetLivenessAnalysis();
441 
442   RegisterLiveness::RegionRegisterLiveness liveness{};
443 
444   Function* function = loop.GetHeaderBlock()->GetParent();
445   analysis->Get(function)->ComputeLoopRegisterPressure(loop, &liveness);
446 
447   return split_criteria_(liveness);
448 }
449 
Process()450 Pass::Status LoopFissionPass::Process() {
451   bool changed = false;
452 
453   for (Function& f : *context()->module()) {
454     // We collect all the inner most loops in the function and run the loop
455     // splitting util on each. The reason we do this is to allow us to iterate
456     // over each, as creating new loops will invalidate the the loop iterator.
457     std::vector<Loop*> inner_most_loops{};
458     LoopDescriptor& loop_descriptor = *context()->GetLoopDescriptor(&f);
459     for (Loop& loop : loop_descriptor) {
460       if (!loop.HasChildren() && ShouldSplitLoop(loop, context())) {
461         inner_most_loops.push_back(&loop);
462       }
463     }
464 
465     // List of new loops which meet the criteria to be split again.
466     std::vector<Loop*> new_loops_to_split{};
467 
468     while (!inner_most_loops.empty()) {
469       for (Loop* loop : inner_most_loops) {
470         LoopFissionImpl impl{context(), loop};
471 
472         // Group the instructions in the loop into two different sets of related
473         // instructions. If we can't group the instructions into the two sets
474         // then we can't split the loop any further.
475         if (!impl.GroupInstructionsByUseDef()) {
476           continue;
477         }
478 
479         if (impl.CanPerformSplit()) {
480           Loop* second_loop = impl.SplitLoop();
481           changed = true;
482           context()->InvalidateAnalysesExceptFor(
483               IRContext::kAnalysisLoopAnalysis);
484 
485           // If the newly created loop meets the criteria to be split, split it
486           // again.
487           if (ShouldSplitLoop(*second_loop, context()))
488             new_loops_to_split.push_back(second_loop);
489 
490           // If the original loop (now split) still meets the criteria to be
491           // split, split it again.
492           if (ShouldSplitLoop(*loop, context()))
493             new_loops_to_split.push_back(loop);
494         }
495       }
496 
497       // If the split multiple times flag has been set add the new loops which
498       // meet the splitting criteria into the list of loops to be split on the
499       // next iteration.
500       if (split_multiple_times_) {
501         inner_most_loops = std::move(new_loops_to_split);
502       } else {
503         break;
504       }
505     }
506   }
507 
508   return changed ? Pass::Status::SuccessWithChange
509                  : Pass::Status::SuccessWithoutChange;
510 }
511 
512 }  // namespace opt
513 }  // namespace spvtools
514