#include "LoopNest.h"

using std::pair;
using std::set;
using std::string;
using std::vector;

namespace Halide {
namespace Internal {
namespace Autoscheduler {

// How small should an innermost loop cluster be before you just
// entirely unroll the thing. Sized for an architecture with 16 vector
// registers.
const int kUnrollLimit = 12;

// Get the HL_NO_SUBTILING environment variable. Setting it to "1"
// disables subtiling, restricting the search to the Mullapudi et al.
// scheduling space.
bool get_may_subtile() {
    string no_subtiling_str = get_env_variable("HL_NO_SUBTILING");
    if (no_subtiling_str == "1") {
        return false;
    } else {
        return true;
    }
}

bool may_subtile() {
    static bool b = get_may_subtile();
    return b;
}

// Given a multi-dimensional box of dimensionality d, generate a list
// of candidate tile sizes for it, logarithmically spacing the sizes
// using the given factor. If 'allow_splits' is false, every dimension
// must either be one, or the full extent of the box. This function is
// used to generate candidate tilings when tiling for
// producer-consumer fusion, or tiling for parallelism.
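// For illustration (an example, not an exhaustive spec): for a single
// loop of extent 8 with factor == 2 and allow_splits == true, the
// candidates produced are the outer extents {4}, {2} and {3} (inner tile
// sizes 2, 4 and ~3); the trivial tiling (outer == 1) and the full
// tiling (outer == 8) are skipped as is_one / is_full.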
vector<vector<int64_t>> generate_tilings(const vector<int64_t> &s, int d, int factor, bool allow_splits) {
    vector<vector<int64_t>> result;
    if (d == -1) {
        result.push_back(vector<int64_t>());
    } else {
        vector<vector<int64_t>> v = generate_tilings(s, d - 1, factor, allow_splits);
        // If we've already generated too many tiling configurations
        // for the inner loops, search the outer loops with coarser
        // granularity.
        while (v.size() > (size_t)factor * 100) {
            factor *= 2;
        }

        for (auto &t : v) {
            bool is_full = false, is_one = false;
            // Skip trivial tilings
            if ((size_t)d == s.size() - 1) {
                is_one = is_full = true;
                for (int i = 0; i < d; i++) {
                    is_one &= (t[i] == 1);
                    is_full &= (t[i] == s[i]);
                }
            }
            t.push_back(0);
            if (!allow_splits) {
                if (!is_one) {
                    t.back() = 1;
                    result.push_back(t);
                }
                if (s[d] != 1 && !is_full) {
                    t.back() = s[d];
                    result.push_back(t);
                }
            } else {
                int max_inner = 0;
                for (int inner = 1; inner < s[d]; inner *= factor) {
                    int outer = (s[d] + inner - 1) / inner;
                    if (is_one && outer == 1) continue;
                    if (is_full && outer == s[d]) continue;
                    // Stop when we hit inner sizes that would do too much recompute
                    if (inner > 1 && inner * outer * 7 > s[d] * 8) break;
                    max_inner = inner;
                    t.back() = outer;
                    result.push_back(t);
                }
                for (int outer = 1; outer <= s[d]; outer *= factor) {
                    int inner = (s[d] + outer - 1) / outer;
                    if (is_one && outer == 1) continue;
                    if (is_full && outer == s[d]) continue;
                    // Stop when we get into the regime covered by the loop above.
                    if (outer > 1 && inner < max_inner * 2) break;
                    // Or when the wasted compute gets too bad.
                    if (inner * outer * 7 > s[d] * 8) break;
                    t.back() = outer;
                    result.push_back(t);
                }

                // The sequence above (in terms of the inner loop)
                // goes 1 2 4 8 16 ...  but 3 is an important inner
                // tiling factor for matrix multiply/gemm-type loops
                // which try to use 12 vector registers.
                int inner3 = 3;
                int outer3 = (s[d] + inner3 - 1) / inner3;
                if (factor == 2 && inner3 < s[d] && outer3 < s[d] && outer3 > 1) {
                    if (inner3 * outer3 * 7 <= s[d] * 8) {
                        t.back() = outer3;
                        result.push_back(t);
                    }
                }
            }
        }
    }
    return result;
}

void LoopNest::copy_from(const LoopNest &n) {
    size = n.size;
    children = n.children;
    inlined = n.inlined;
    store_at = n.store_at;
    bounds = n.bounds;
    node = n.node;
    stage = n.stage;
    innermost = n.innermost;
    tileable = n.tileable;
    parallel = n.parallel;
    vector_dim = n.vector_dim;
    vectorized_loop_index = n.vectorized_loop_index;
}

// Hash the loop structure and sizes up to a fixed depth. This is
// used as the hash function for the coarse-to-fine beam search in
// the paper.
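// Roughly: depth 0 hashes only which Funcs are stored, computed, or
// inlined at this level; depth 1 additionally hashes the child loop
// sizes collapsed to "1 or more than 1"; each further increase of two
// in depth descends one more level of the nest with exact loop sizes.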
void LoopNest::structural_hash(uint64_t &h, int depth) const {
    if (depth < 0) return;

    // Which Funcs are store_at this level?
    for (const auto *n : store_at) {
        hash_combine(h, n->id);
    }

    hash_combine(h, -1);

    // Which Funcs are compute_at this level?
    for (const auto &c : children) {
        hash_combine(h, c->stage->id);
    }

    // Add a barrier to ensure that moving something from the last
    // compute_at to the first inlined doesn't result in the same
    // hash.
    hash_combine(h, -1);

    // Which Funcs are inlined at this level?
    for (auto it = inlined.begin(); it != inlined.end(); it++) {
        hash_combine(h, it.key()->id);
    }

    hash_combine(h, -1);

    if (depth > 0) {
        // What are the loop sizes of the children?
        for (const auto &c : children) {
            for (int64_t s : c->size) {
                if (depth == 1) {
                    // Just take the most significant bit: is it one or not?
                    s = (s > 1) ? 1 : 0;
                }
                hash_combine(h, s);
            }
        }

        // Which dimension are we vectorized over?
        hash_combine(h, vectorized_loop_index);
    }

    if (depth > 1) {
        // Descend into children
        for (const auto &c : children) {
            c->structural_hash(h, depth - 2);
        }
    }
}

// Compute all the sites of interest for each pipeline stage
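// The Sites record filled in here summarizes where each stage lives in
// the nest: 'compute' is the loop level the stage is computed at (the
// parent of its production), 'produce' is the loop over the stage
// itself, 'store' is where a store_at was placed, 'innermost' is the
// stage's innermost loop, and 'task' is the outermost non-root
// (parallel task) loop enclosing it. Inlined Funcs have all of these
// point at the loop they are inlined into.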
void LoopNest::get_sites(StageMap<Sites> &sites,
                         const LoopNest *task,
                         const LoopNest *parent) const {
    if (!task && !is_root()) {
        task = this;
    }
    for (const auto &c : children) {
        c->get_sites(sites, task, this);
    }
    if (parent && node != parent->node) {
        auto &s = sites.get_or_create(stage);
        s.compute = parent;
        s.produce = this;
        s.task = task;
    }
    for (auto f : store_at) {
        for (const auto &s : f->stages) {
            sites.get_or_create(&s).store = this;
        }
    }
    for (auto it = inlined.begin(); it != inlined.end(); it++) {
        auto &s = sites.get_or_create(&(it.key()->stages[0]));
        s.inlined = true;
        s.compute = s.store = s.produce = s.innermost = this;
        s.task = task;
    }
    if (innermost) {
        sites.get_or_create(stage).innermost = this;
    }
}

// Do a recursive walk over the loop nest computing features to feed the cost model.
void LoopNest::compute_features(const FunctionDAG &dag,
                                const MachineParams &params,
                                const StageMap<Sites> &sites,
                                int64_t instances,
                                int64_t parallelism,
                                const LoopNest *parent,
                                const LoopNest *grandparent,
                                const LoopNest &root,
                                int64_t *working_set,
                                StageMap<ScheduleFeatures> *features) const {
    int64_t working_set_here = 0;

    int64_t loop_instances = 1, parallel_tasks = 1;
    bool in_impure = false;
    for (int idx = (int)size.size() - 1; idx >= 0; idx--) {
        size_t i = size[idx];
        loop_instances *= i;
        if (stage->loop[idx].pure && !in_impure) {
            if (params.parallelism > 1 &&
                (parallel || (parent->is_root() && parallel_tasks < params.parallelism))) {
                // Either we've picked our parallel tiling, or
                // it's not yet determined. Assume we'll not split
                // any loops and just stop after we hit the
                // required number of cores
                parallel_tasks *= i;
                // If we haven't picked our parallel tiling yet,
                // assume that we'll target 8*cores when we do,
                // which is a common rule of thumb.
                if (!parallel && parallel_tasks > params.parallelism * 8) {
                    // We would split this loop
                    parallel_tasks = params.parallelism * 8;
                }
            }
        } else if (i != 1) {
            in_impure = true;
        }
    }
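    // Illustrative example (the numbers are assumptions, not from the
    // source): with params.parallelism == 16, two pure loops of extents
    // 64 and 32 at the root, and no parallel tiling committed yet,
    // parallel_tasks saturates at 16 * 8 == 128 rather than the full 2048.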

    int64_t subinstances = instances * loop_instances;

    for (const auto *node : store_at) {
        // Figure out the features at the store_at level
        const auto &bounds = get_bounds(node);

        for (size_t s = 0; s < node->stages.size(); s++) {
            // TODO: Lift invariants from this loop. Most of it's the same for every stage.
            internal_assert(!node->is_input);
            ScheduleFeatures &feat = features->get_or_create(&(node->stages[s]));

            feat.num_realizations = subinstances;

            feat.points_computed_per_realization = 1;
            feat.num_scalars = feat.num_vectors = subinstances;
            bool vectorized = false;
            for (int i = 0; i < (int)node->stages[s].loop.size(); i++) {
                const auto &p = bounds->loops(s, i);
                int64_t extent = p.extent();
                feat.points_computed_per_realization *= extent;
                if (i == sites.get(&(node->stages[s])).produce->vectorized_loop_index) {
                    // Assumes that we're not going to split
                    // things such that non-native-width
                    // vectorization is a problem, except for the
                    // tail.
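                    // e.g. (illustrative) an extent of 100 with a vector
                    // size of 8 contributes a factor of 12 to num_vectors
                    // (full vectors) and 4 to num_scalars (the tail).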
                    feat.num_vectors *= extent / node->stages[s].vector_size;
                    feat.num_scalars *= extent % node->stages[s].vector_size;
                    vectorized = true;
                } else {
                    feat.num_vectors *= extent;
                    feat.num_scalars *= extent;
                }
            }
            if (!vectorized) {
                feat.num_vectors = 0;
            }
            feat.points_computed_total = feat.points_computed_per_realization * feat.num_realizations;

            feat.bytes_at_realization = node->bytes_per_point;
            for (int i = 0; i < node->dimensions; i++) {
                const auto &p = bounds->region_computed(i);
                feat.bytes_at_realization *= p.extent();
            }
            int64_t innermost_storage_extent = 1;
            int v = sites.get(&(node->stages[s])).produce->vector_dim;
            if (v >= 0 && node->dimensions > 0) {
                innermost_storage_extent = bounds->region_computed(v).extent();
            }
            feat.innermost_bytes_at_realization = node->bytes_per_point * innermost_storage_extent;

            if (!is_root()) {
                feat.bytes_at_task = feat.bytes_at_realization;
                feat.innermost_bytes_at_task = feat.innermost_bytes_at_realization;
            }
        }
    }

    if (is_root()) {
        // TODO: This block of code is repeated below. Refactor
        for (const auto &c : children) {
            c->compute_features(dag, params, sites, subinstances, parallelism, this, parent, root, &working_set_here, features);
        }

        for (const auto *node : store_at) {
            auto &feat = features->get(&(node->stages[0]));
            working_set_here += feat.bytes_at_production;
        }
        for (const auto *node : store_at) {
            for (const auto &s : node->stages) {
                auto &feat = features->get(&s);
                feat.working_set_at_realization = working_set_here;
            }
        }
        for (const auto &c : children) {
            if (c->node != node) {
                auto &feat = features->get(c->stage);
                feat.working_set_at_production = working_set_here;
            }
        }

        // Figure out the root-level features for every Func
        for (auto it = features->begin(); it != features->end(); it++) {
            const auto *stage = it.key();
            const auto *node = stage->node;
            auto &feat = it.value();
            const auto &root_bounds = root.get_bounds(node);

            feat.bytes_at_root = node->bytes_per_point;
            for (int i = 0; i < node->dimensions; i++) {
                const auto &p = root_bounds->region_computed(i);
                feat.bytes_at_root *= p.extent();
            }

            feat.working_set_at_root = working_set_here;

            auto *p = sites.get(stage).produce;
            if (p) {
                // Extent of the innermost dimension in the storage layout
                int64_t innermost_storage_extent = 1;
                int v = p->vector_dim;
                if (v >= 0 && v < node->dimensions) {
                    innermost_storage_extent = root_bounds->region_computed(v).extent();
                }
                feat.innermost_bytes_at_root = node->bytes_per_point * innermost_storage_extent;
            } else {
                feat.innermost_bytes_at_root = 0;
            }

            feat.points_computed_minimum = 1;
            for (int i = 0; i < (int)stage->loop.size(); i++) {
                const auto &p = root_bounds->loops(stage->index, i);
                feat.points_computed_minimum *= p.extent();
            }

            if (node->stages.size() == 1 && !node->is_output) {
                int64_t points_computed_minimum_if_inlined = 0;
                for (auto *e : node->outgoing_edges) {
                    points_computed_minimum_if_inlined += features->get(e->consumer).points_computed_minimum * e->calls;
                }
                feat.points_computed_minimum = std::min(feat.points_computed_minimum, (double)points_computed_minimum_if_inlined);
            }
        }

        return;
    }

    int64_t subparallelism = parallel_tasks * parallelism;

    // Figure out the features at the compute_at level
    internal_assert(!stage->node->is_input);
    ScheduleFeatures &feat = features->get_or_create(stage);

    if (innermost) {
        if (vectorized_loop_index >= 0 && vectorized_loop_index < (int)size.size()) {
            feat.vector_size = size[vectorized_loop_index];
        } else {
            feat.vector_size = 1;
        }
        if (feat.vector_size == 1) {
            // They're all scalars
            feat.num_scalars += feat.num_vectors;
            feat.num_vectors = 0;
        }
    } else {
        // We want these features just outside the innermost loop,
        // so just set them at every level and let them get
        // progressively overwritten as we descend the loop nest
        // tree.
        size_t idx = 0;
        feat.innermost_loop_extent = 1;
        feat.innermost_pure_loop_extent = 1;
        for (const auto &l : stage->loop) {
            feat.innermost_loop_extent *= size[idx];
            if (!l.rvar) {
                feat.innermost_pure_loop_extent *= size[idx];
            }
            idx++;
        }
    }

    const bool at_task = parent->is_root();
    const bool at_production = parent->node != node;
    const bool at_pure_production = at_production && stage->index == 0;

    if (at_task) {
        if (parallel) {
            const auto &bounds = get_bounds(node);
            feat.bytes_at_task = node->bytes_per_point;
            int64_t innermost_storage_extent = 1;
            for (int i = 0; i < node->dimensions; i++) {
                int64_t outer = 1;
                for (size_t l = 0; l < stage->loop.size(); l++) {
                    if (stage->loop[l].var == node->func.args()[i]) {
                        outer = size[l];
                        break;
                    }
                }
                const auto &p = bounds->region_computed(i);
                int64_t extent = p.extent();
                extent /= outer;
                feat.bytes_at_task *= extent;
                if (i == vector_dim) {
                    innermost_storage_extent = extent;
                }
            }
            feat.innermost_bytes_at_task = node->bytes_per_point * innermost_storage_extent;
        } else {
            // How this loop will be parallelized is not yet
            // determined. Use optimistic values for the features.
            feat.bytes_at_task = (feat.bytes_at_realization + params.parallelism - 1) / params.parallelism;
            feat.innermost_bytes_at_task = std::min(feat.bytes_at_task, feat.innermost_bytes_at_realization);
        }

        feat.unique_bytes_read_per_task = 0;
        feat.unique_lines_read_per_task = 0;

        // We're at a parallel for loop. Check all the accesses
        // done by Funcs inside this loop to values computed
        // outside of it to figure out how much data we'll be
        // streaming onto the core.
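        // The walk below is a depth-first traversal over producers, with a
        // 'done' set to avoid revisiting: producers stored at the root
        // contribute the bytes and contiguous lines of the region this task
        // requires of them, while producers computed inside this task are
        // looked through to their own inputs.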
        vector<const FunctionDAG::Edge *> pending;
        set<const FunctionDAG::Node *> done;
        for (const auto *e : stage->incoming_edges) {
            pending.push_back(e);
        }
        while (!pending.empty()) {
            const auto *e = pending.back();
            pending.pop_back();
            if (done.count(e->producer)) continue;
            done.insert(e->producer);
            const auto &site = sites.get(&(e->producer->stages[0]));
            if (site.store->is_root()) {
                const auto &b = get_bounds(e->producer);
                int64_t bytes = e->producer->bytes_per_point, lines = 1;
                int64_t max_extent = 1;
                // clang-format off
                int vector_dim = (e->producer->is_input   ? 0 :
                                  site.produce != nullptr ? site.produce->vector_dim :
                                                            -1);
                // clang-format on
                for (int i = 0; i < e->producer->dimensions; i++) {
                    int64_t extent = b->region_required(i).extent();
                    max_extent = std::max(extent, max_extent);
                    bytes *= extent;
                    if (i != vector_dim) {
                        lines *= extent;
                    }
                }
                if (!e->producer->is_input && site.produce == nullptr) {
                    // We haven't scheduled the producer so we
                    // don't know the memory layout yet. Assume
                    // the best case.
                    lines /= max_extent;
                }
                feat.unique_bytes_read_per_task += bytes;
                feat.unique_lines_read_per_task += lines;

            } else if (site.produce != nullptr) {
                // Computation must be nested inside this task or inlined into it.
                for (const auto &s : e->producer->stages) {
                    for (const auto *e2 : s.incoming_edges) {
                        pending.push_back(e2);
                    }
                }
            }
        }
    }

    if (at_production) {
        feat.num_productions = instances;
        feat.inner_parallelism = parallel_tasks;
        feat.outer_parallelism = parallelism;
        feat.native_vector_size = stage->vector_size;

        const auto &bounds = parent->get_bounds(node);

        feat.bytes_at_production = node->bytes_per_point;
        for (int i = 0; i < node->dimensions; i++) {
            const auto &p = bounds->region_computed(i);
            feat.bytes_at_production *= p.extent();
        }
        int64_t innermost_storage_extent = 1;
        if (vector_dim >= 0 && node->dimensions > 0) {
            innermost_storage_extent = bounds->region_computed(vector_dim).extent();
        }
        feat.innermost_bytes_at_production = node->bytes_per_point * innermost_storage_extent;
    }

    // Recurse inwards
    for (const auto &c : children) {
        c->compute_features(dag, params, sites, subinstances, subparallelism, this, parent, root, &working_set_here, features);
    }
    for (const auto *node : store_at) {
        auto &feat = features->get(&(node->stages[0]));
        working_set_here += feat.bytes_at_production;
    }
    for (const auto *node : store_at) {
        for (const auto &s : node->stages) {
            auto &feat = features->get(&s);
            feat.working_set_at_realization = working_set_here;
        }
    }
    for (const auto &c : children) {
        if (c->node != node) {
            auto &feat = features->get(c->stage);
            feat.working_set_at_production = working_set_here;
        }
    }

    if (at_task) {
        set_working_set_at_task_feature(working_set_here, features);
    }

    if (at_production) {
        feat.working_set = working_set_here;
    }

    if (innermost) {
        bool parent_unrolled =
            (feat.innermost_pure_loop_extent <= kUnrollLimit &&
             parent->node == node);

        if (parent_unrolled) {
            const auto &grandparent_bounds = grandparent->get_bounds(node);
            for (size_t i = 0; i < parent->size.size(); i++) {
                if (!stage->loop[i].rvar) {
                    const auto &l = grandparent_bounds->loops(parent->stage->index, i);
                    parent_unrolled &= l.constant_extent();
                }
            }
        }

        if (parent_unrolled) {
            feat.unrolled_loop_extent = feat.innermost_pure_loop_extent;
        } else {
            feat.unrolled_loop_extent = 1;
        }
    }

    *working_set += working_set_here;

    // Analyze all memory dependencies of this stage, looking
    // through any Funcs inlined into it. This is where we track
    // things like vector gathers.
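    // Each load Jacobian encountered below is classified, relative to the
    // vectorized loop, as a broadcast, a dense vector load, a stride-2/3/4
    // load, or a gather, and the counts feed the corresponding features.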
    int64_t bytes_loaded = 0, lines_loaded = 0, allocation_bytes_loaded = 0;
    double num_dense_loads = 0, num_broadcasts = 0,
           num_gathers = 0, num_stride_2_loads = 0,
           num_stride_3_loads = 0, num_stride_4_loads = 0,
           num_loads = 0;
    if (innermost || at_production) {  // These are the sites at which we compute load footprints
        // Pick the site at which we will compute the footprint relationship
        const auto &consumer_site = sites.get(stage);

        // The store_at location of the consumer
        const auto *consumer_store_site = innermost ? parent : consumer_site.store;

        // The parallel loop of the consumer
        const auto *consumer_task_site = consumer_site.task;

        int64_t consumer_instances = innermost ? instances : feat.num_realizations;
        internal_assert(consumer_instances != 0);

        vector<const FunctionDAG::Node::Stage *> pending;
        pending.emplace_back(stage);
        vector<pair<LoadJacobian, FunctionDAG::Node *>> jacobians;
        set<const FunctionDAG::Node *> done;
        while (!pending.empty()) {
            auto p = pending.back();
            pending.pop_back();
            const auto &next_edges = p->incoming_edges;
            for (const auto *e : next_edges) {
                internal_assert(sites.contains(&(e->producer->stages[0])))
                    << "No site found for " << e->producer->func.name() << "\n";

                const auto &site = sites.get(&(e->producer->stages[0]));

                bool producer_has_been_scheduled = e->producer->is_input || (site.produce != nullptr);

                if (innermost) {
                    if (e->consumer == stage) {
                        for (auto &j : e->load_jacobians) {
                            jacobians.emplace_back(j, e->producer);
                        }
                    } else {
                        // Consumer was inlined. Multiply the Jacobians to look through it.
                        decltype(jacobians) new_jacobians;
                        for (auto &j1 : jacobians) {
                            if (e->consumer->node == j1.second) {
                                for (auto &j2 : e->load_jacobians) {
                                    LoadJacobian j = j2 * j1.first;
                                    new_jacobians.emplace_back(j, e->producer);
                                }
                            } else {
                                new_jacobians.emplace_back(std::move(j1));
                            }
                        }
                        jacobians.swap(new_jacobians);
                    }
                }

                if (site.inlined) {
                    // Recursively examine the inputs
                    pending.emplace_back(&(e->producer->stages[0]));
                    continue;
                }

                // The producer's compute_at site
                const auto *producer_compute_site = site.compute;

                // The producer's store_at site
                const auto *producer_store_site = site.store;

                // The region required of the producer at various sites.
                const auto &bounds = consumer_store_site->get_bounds(e->producer);
                const auto &task_bounds = consumer_task_site->get_bounds(e->producer);
                const auto &producer_compute_bounds = producer_compute_site->get_bounds(e->producer);
                const auto &producer_store_bounds = producer_store_site->get_bounds(e->producer);

                // Compute memory footprints in terms of the
                // number of contiguous lines, and the number of
                // bytes.
                int64_t footprint = e->producer->bytes_per_point;
                int64_t compute_footprint = footprint;
                int64_t store_footprint = footprint;
                int64_t task_footprint = footprint;
                int64_t line_footprint = 1;
                int64_t compute_line_footprint = 1;
                int64_t store_line_footprint = 1;
                int64_t task_line_footprint = 1;

                if (e->producer->is_input) {
                    // This node represents an input. Its sites
                    // should be at the root level.
                    internal_assert(producer_store_site->is_root());
                    internal_assert(producer_compute_site->is_root());
                }

                if (innermost) {

                    // Grab the Jacobians that describe the memory dependence
                    for (const auto &jac : jacobians) {
                        if (jac.second != e->producer) continue;
                        double n = jac.first.count();

                        // Classify them to figure out what's going on in the vector dimension.
                        bool vector_broadcast = true;
                        bool dense_vector_load = true;
                        bool stride_2_vector_load = true;
                        bool stride_3_vector_load = true;
                        bool stride_4_vector_load = true;
                        int producer_innermost_dim =
                            (e->producer->is_input ? 0 :  // Assume default storage layout for inputs
                                 !producer_has_been_scheduled ? -1 :
                                                                site.produce->vector_dim);
                        if (vectorized_loop_index >= 0) {
                            if (!producer_has_been_scheduled) {
                                // Operate optimistically and just
                                // see if *any* dimension of the
                                // producer would make for a good
                                // load.
                                int count[5] = {0, 0, 0, 0, 0};
                                for (int i = 0; i < e->producer->dimensions; i++) {
                                    auto stride = jac.first(i, vectorized_loop_index);
                                    // stride is a rational. Check to see if it's a small integer.
                                    if (stride == 0) {
                                        count[0]++;
                                    } else if (stride == 1) {
                                        count[1]++;
                                    } else if (stride == 2) {
                                        count[2]++;
                                    } else if (stride == 3) {
                                        count[3]++;
                                    } else if (stride == 4) {
                                        count[4]++;
                                    }
                                }
                                vector_broadcast = (count[0] == e->producer->dimensions);
                                dense_vector_load = (count[0] == e->producer->dimensions - 1 && count[1] == 1);
                                stride_2_vector_load = (count[0] == e->producer->dimensions - 1 && count[2] == 1);
                                stride_3_vector_load = (count[0] == e->producer->dimensions - 1 && count[3] == 1);
                                stride_4_vector_load = (count[0] == e->producer->dimensions - 1 && count[4] == 1);
                            } else {
                                for (int i = 0; i < e->producer->dimensions; i++) {
                                    auto stride = jac.first(i, vectorized_loop_index);
                                    vector_broadcast &= stride == 0;
                                    if (i == producer_innermost_dim) {
                                        dense_vector_load &= stride == 1;
                                        stride_2_vector_load &= stride == 2;
                                        stride_3_vector_load &= stride == 3;
                                        stride_4_vector_load &= stride == 4;
                                    } else {
                                        dense_vector_load &= stride == 0;
                                        stride_2_vector_load &= stride == 0;
                                        stride_3_vector_load &= stride == 0;
                                        stride_4_vector_load &= stride == 0;
                                        // TODO: Check for strided
                                        // loads across non-innermost
                                        // dims, and use to count the
                                        // number of pages, cache
                                        // lines, cache conflict misses, etc.
                                    }
                                }
                            }
                        }

                        // Is this load loop-invariant over an
                        // unrolled block? If so, we amortize the
                        // number of loads to account for
                        // LICM. This is the key performance
                        // optimization you get from unrolling the
                        // inner loop of a gemm or conv, so it's
                        // important to capture it in the
                        // featurization.
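                        // e.g. (illustrative) in a 4x4 unrolled micro-kernel, a
                        // load whose Jacobian entries are all zero for one of the
                        // two unrolled pure loops is invariant to it, so its count
                        // is divided by that loop's extent of 4.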
                        int64_t amortization = 1;
                        if (feat.unrolled_loop_extent > 1) {
                            for (size_t idx = 0; idx < stage->loop.size(); idx++) {
                                if (!stage->loop[idx].rvar) {
                                    bool loop_invariant = true;
                                    for (int i = 0; i < e->producer->dimensions; i++) {
                                        if (!(jac.first(i, idx) == 0)) {
                                            loop_invariant = false;
                                            break;
                                        }
                                    }
                                    if (loop_invariant) {
                                        amortization *= parent->size[idx];
                                    }
                                }
                            }
                        }
                        n /= amortization;

                        num_loads += n;
                        if (vector_broadcast) {
                            num_broadcasts += n;
                        } else if (dense_vector_load) {
                            num_dense_loads += n;
                        } else if (stride_2_vector_load) {
                            num_stride_2_loads += n;
                        } else if (stride_3_vector_load) {
                            num_stride_3_loads += n;
                        } else if (stride_4_vector_load) {
                            num_stride_4_loads += n;
                        } else {
                            num_gathers += n;
                        }
                    }
                }

                // Already dealt with the footprints for this producer via some other path
                if (done.find(e->producer) != done.end()) {
                    continue;
                }

                done.insert(e->producer);

                // Now look at the shapes of the regions read from
                // the producer at various sites.
                int64_t max_extent = 1, max_compute_extent = 1, max_store_extent = 1, max_task_extent = 1;
                for (int i = 0; i < e->producer->dimensions; i++) {
                    auto p = bounds->region_required(i);
                    auto compute_p = producer_compute_bounds->region_computed(i);
                    auto store_p = producer_store_bounds->region_required(i);
                    auto task_p = task_bounds->region_required(i);

                    // Check some invariants
                    internal_assert(store_p.min() <= store_p.max()) << store_p.min() << " " << store_p.max() << "\n";
                    internal_assert(compute_p.min() <= compute_p.max()) << compute_p.min() << " " << compute_p.max() << "\n";
                    internal_assert(task_p.min() <= task_p.max()) << task_p.min() << " " << task_p.max() << "\n";

                    int64_t extent = p.extent();
                    int64_t compute_extent = compute_p.extent();
                    int64_t store_extent = store_p.extent();
                    int64_t task_extent = task_p.extent();

                    max_extent = std::max(extent, max_extent);
                    max_compute_extent = std::max(compute_extent, max_compute_extent);
                    max_store_extent = std::max(store_extent, max_store_extent);
                    max_task_extent = std::max(task_extent, max_task_extent);

                    footprint *= extent;
                    compute_footprint *= compute_extent;
                    store_footprint *= store_extent;
                    task_footprint *= task_extent;

                    bool dense = ((e->producer->is_input && i == 0) ||
                                  (site.produce != nullptr && i == site.produce->vector_dim));
                    if (!dense) {
                        line_footprint *= extent;
                        compute_line_footprint *= compute_extent;
                        store_line_footprint *= store_extent;
                        task_line_footprint *= task_extent;
                    }
                }

                if (!producer_has_been_scheduled) {
                    // Optimistically assume it gets vectorized
                    // along whatever dimension makes these
                    // numbers the smallest.
                    line_footprint /= max_extent;
                    compute_line_footprint /= max_compute_extent;
                    store_line_footprint /= max_store_extent;
                    task_line_footprint /= max_task_extent;
                }

                int64_t store_instances_per_consumption = 1;

                if (producer_has_been_scheduled && !e->producer->is_input) {
                    const auto &producer_feat = features->get_or_create(&(e->producer->stages[0]));

                    if (producer_feat.num_realizations) {
                        // The producer's realization is nested inside this Func's realization
                        const int64_t producer_store_instances = producer_feat.num_realizations;
                        if (producer_store_instances > consumer_instances) {
                            store_instances_per_consumption = producer_store_instances / consumer_instances;
                        }
                    }
                }

                allocation_bytes_loaded += compute_footprint;

                if (store_instances_per_consumption > 1) {
                    // The producer is nested inside the consumer
                    bytes_loaded += store_footprint;
                    // Due to folding, the actual buffer size is smaller than the bounds at the store level
                    lines_loaded += store_line_footprint;
                } else {
                    // The consumer is consuming some portion of a larger producer computed earlier
                    bytes_loaded += footprint;
                    lines_loaded += line_footprint;
                }

                // We compute (but never use) these; computing them is cheap,
                // so let's leave them in for future reference, but mark them
                // as 'ignore me' to avoid clang-tidy warnings.
                (void)compute_line_footprint;
                (void)task_line_footprint;
            }
        }
    }

    if (at_production) {
        // Properties of the realization, but the values are
        // computable at the production site because that's where
        // the consumers are.
        internal_assert(bytes_loaded >= 0) << "Negative bytes loaded: " << bytes_loaded << "\n";
        feat.allocation_bytes_read_per_realization = allocation_bytes_loaded;
        feat.unique_bytes_read_per_realization = bytes_loaded;
        feat.unique_lines_read_per_realization = lines_loaded;

        if (!at_pure_production) {
            // Also pessimistically assume this update definition relies on the entirety of the produced region so far.
            // TODO: This overbills scatters, or writes to a sub-window.
            internal_assert(bytes_loaded >= 0) << "Negative bytes at production: " << feat.bytes_at_production << "\n";
            feat.unique_bytes_read_per_realization += feat.bytes_at_production;
            feat.unique_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production;
            feat.allocation_bytes_read_per_realization += feat.bytes_at_production;
        }
    }

    if (innermost) {
        feat.points_computed_per_production = subinstances / feat.num_productions;
        // Halide codegens strided loads for small strides as a
        // large dense vector load and a cheap swizzle. ARM even
        // has instructions that do this for free on load
        // (e.g. vld4).
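        // A stride-k load is therefore modeled as k overlapping dense
        // vector loads below.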
        feat.vector_loads_per_vector = (num_dense_loads +
                                        2 * num_stride_2_loads +
                                        3 * num_stride_3_loads +
                                        4 * num_stride_4_loads);
        feat.scalar_loads_per_vector = num_broadcasts + feat.vector_size * num_gathers;
        feat.scalar_loads_per_scalar = num_loads;
        if (stage->index > 0) {
            // Assume at update definitions we do a self-load
            feat.vector_loads_per_vector++;
            feat.scalar_loads_per_scalar++;
        }
        feat.unique_bytes_read_per_vector = bytes_loaded;
        feat.unique_lines_read_per_vector = lines_loaded;
    }

    // Track features for inlined Funcs
    for (auto it = inlined.begin(); it != inlined.end(); it++) {
        const auto *f = it.key();
        internal_assert(f);
        auto &inlined_feat = features->get_or_create(&(f->stages[0]));
        inlined_feat.inlined_calls += it.value() * subinstances;
        inlined_feat.num_vectors += it.value() * feat.num_vectors;
        inlined_feat.num_scalars += it.value() * feat.num_scalars;
        inlined_feat.native_vector_size = stage->vector_size;
        if (inlined_feat.vector_size > 0) {
            inlined_feat.vector_size = std::min(inlined_feat.vector_size, (double)stage->vector_size);
        } else {
            inlined_feat.vector_size = feat.vector_size;
        }
        if (inlined_feat.innermost_pure_loop_extent > 0) {
            inlined_feat.innermost_pure_loop_extent =
                std::min(inlined_feat.innermost_pure_loop_extent,
                         feat.innermost_pure_loop_extent);
        } else {
            inlined_feat.innermost_pure_loop_extent = feat.innermost_pure_loop_extent;
        }
        inlined_feat.inner_parallelism = 1;
        inlined_feat.outer_parallelism = parallelism;
    }
}

// Get the region required of a Func at this site, from which we
// know what region would be computed if it were scheduled here,
// and what its loop nest would be.
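// The result is memoized in 'bounds'. The computation proceeds in three
// steps: (1) compute the region of f required by consumers within this
// loop (or take the output estimate at the root), (2) map the required
// region to the region actually computed using the bounds relationships,
// and (3) derive concrete loop extents for each of f's stages over that
// computed region.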
const Bound &LoopNest::get_bounds(const FunctionDAG::Node *f) const {
    if (bounds.contains(f)) {
        const Bound &b = bounds.get(f);
        // Expensive validation for debugging
        // b->validate();
        return b;
    }
    auto bound = f->make_bound();

    // Compute the region required
    if (f->is_output && is_root()) {
        internal_assert(f->outgoing_edges.empty()) << "Outputs that access other outputs not yet supported\n";
        // It's an output. Use the bounds estimate.
        for (int i = 0; i < f->dimensions; i++) {
            bound->region_required(i) = f->estimated_region_required[i];
        }
    } else {
        internal_assert(!f->outgoing_edges.empty())
            << "No consumers of " << f->func.name()
            << " at loop over " << (is_root() ? "root" : node->func.name()) << "\n";
        auto init = Span::empty_span();
        for (int i = 0; i < f->dimensions; i++) {
            bound->region_required(i) = init;
        }

        for (const auto *e : f->outgoing_edges) {
            // Ignore consumers outside of this loop nest
            if (!is_root() &&
                (stage != e->consumer) &&
                !stage->downstream_of(*(e->consumer->node))) {
                continue;
            }
            const auto &c_bounds = get_bounds(e->consumer->node);

            // Get the concrete sizes of the consuming loop
            const auto *consumer_loop = &(c_bounds->loops(e->consumer->index, 0));

            // Use the bounds relationship between the nodes to
            // map from the consumer's loop to the required region
            // of the producer.
            e->expand_footprint(consumer_loop, &(bound->region_required(0)));
        }
    }

    // Given a required region of this producer, use the bounds
    // analysis to figure out what region actually gets
    // computed. For most funcs, these are the same. Some things,
    // like histograms or scans, you can only really compute all
    // of at once.
    f->required_to_computed(&(bound->region_required(0)), &(bound->region_computed(0)));

    // Finally, figure out what loop nests will be used to compute
    // this region.
    for (int i = 0; i < (int)f->stages.size(); i++) {
        f->loop_nest_for_region(i, &(bound->region_computed(0)), &(bound->loops(i, 0)));
    }

    const Bound &b = set_bounds(f, bound);
    // Validation is expensive, turn it off by default.
    // b->validate();
    return b;
}

// Recursively print a loop nest representation to stderr
void LoopNest::dump(string prefix, const LoopNest *parent) const {
    if (!is_root()) {
        // Non-root nodes always have parents.
        internal_assert(parent != nullptr);

        aslog(0) << prefix << node->func.name();
        prefix += " ";

        for (size_t i = 0; i < size.size(); i++) {
            aslog(0) << " " << size[i];
            // The vectorized loop gets a 'v' suffix
            if (innermost && i == (size_t)vectorized_loop_index) {
                aslog(0) << "v";
            }
            // Loops that have a known constant size get a
            // 'c'. Useful for knowing what we can unroll.
            if (parent->get_bounds(node)->loops(stage->index, i).constant_extent()) {
                aslog(0) << "c";
            }
        }

        // Uncomment when debugging the representative loop bounds selected.
        /*
            const auto &bounds = get_bounds(node);
            for (size_t i = 0; i < size.size(); i++) {
                const auto &p = bounds->loops(stage->index, i);
                aslog(0) << " [" << p.first << ", " << p.second << "]";
            }
        */

        aslog(0) << " (" << vectorized_loop_index << ", " << vector_dim << ")";
    }

    if (tileable) {
        aslog(0) << " t";
    }
    if (innermost) {
        aslog(0) << " *\n";
    } else if (parallel) {
        aslog(0) << " p\n";
    } else {
        aslog(0) << "\n";
    }
    for (auto p : store_at) {
        aslog(0) << prefix << "realize: " << p->func.name() << "\n";
    }
    for (size_t i = children.size(); i > 0; i--) {
        children[i - 1]->dump(prefix, this);
    }
    for (auto it = inlined.begin(); it != inlined.end(); it++) {
        aslog(0) << prefix << "inlined: " << it.key()->func.name() << " " << it.value() << "\n";
    }
}

// Does this loop nest access the given Func
bool LoopNest::calls(const FunctionDAG::Node *f) const {
    for (const auto &c : children) {
        if (c->calls(f)) return true;
    }
    for (const auto *e : f->outgoing_edges) {
        if (e->consumer == stage) {
            return true;
        }
        if (inlined.contains(e->consumer->node)) {
            return true;
        }
    }
    return false;
}

// What is the maximum number of inlined calls to a Func that
// occur within this loop. Used to prune states that would
// generate too much code.
int64_t LoopNest::max_inlined_calls() const {
    int64_t result = 0;
    for (auto it = inlined.begin(); it != inlined.end(); it++) {
        result = std::max(result, it.value());
    }
    for (const auto &c : children) {
        result = std::max(result, c->max_inlined_calls());
    }
    return result;
}

// Does this loop nest access an input buffer? Used to select
// tail strategies when splitting loops. We don't want to read
// out of bounds on inputs, even if we don't intend to use the
// values read. It could create annoying assertion failures for
// the user. It's OK to read out of range of the values computed
// on internal Funcs though. Allocation bounds inference just pads
// out the bounds so that it won't fault.
bool LoopNest::accesses_input_buffer() const {
    for (const auto &c : children) {
        if (c->accesses_input_buffer()) return true;
    }
    if (is_root()) return false;

    auto check = [&](const FunctionDAG::Node::Stage *s) {
        for (const auto *e : s->incoming_edges) {
            if (e->producer->is_input) return true;
        }

        for (int t = 0; t < (int)PipelineFeatures::ScalarType::NumScalarTypes; t++) {
            if (s->features.op_histogram[(int)PipelineFeatures::OpType::ImageCall][t] > 0) return true;
        }
        return false;
    };

    if (check(stage)) return true;
    for (auto it = inlined.begin(); it != inlined.end(); it++) {
        if (check(&(it.key()->stages[0]))) return true;
    }
    return false;
}

// Does this loop nest contain a computation of the given Func.
bool LoopNest::computes(const FunctionDAG::Node *f) const {
    if (f == node) {
        return true;
    }
    if (inlined.contains(f)) {
        return true;
    }
    for (const auto &c : children) {
        if (c->computes(f)) return true;
    }
    return false;
}

// Above here most methods query the loop nest. Below we have
// methods that mutate the loop nest.

// Inline a Func into all consumers within this loop.
void LoopNest::inline_func(const FunctionDAG::Node *f) {
    // Inline it into the children
    for (size_t i = 0; i < children.size(); i++) {
        if (children[i]->calls(f)) {
            std::unique_ptr<LoopNest> new_child{new LoopNest};
            new_child->copy_from(*children[i]);
            new_child->inline_func(f);
            children[i] = new_child.release();
        }
    }

    // Inline it here if there are any direct calls
    if (innermost) {
        int64_t calls = 0;
        for (const auto *e : f->outgoing_edges) {
            if (inlined.contains(e->consumer->node)) {
                calls += inlined.get(e->consumer->node) * e->calls;
            }
            if (e->consumer == stage) {
                calls += e->calls;
            }
        }
        if (calls) {
            inlined.insert(f, calls);
        }
    }
}

// Compute a Func at this site.
void LoopNest::compute_here(const FunctionDAG::Node *f, bool tileable, int v) {
    const auto &bounds = get_bounds(f);

    if (!may_subtile()) {
        // If we are restricting ourselves to the Mullapudi et al
        // scheduling space, then once something is computed here
        // we may not subtile this loop.
        this->tileable = false;
    }

    for (int s = (int)f->stages.size() - 1; s >= 0; s--) {
        LoopNest *node = new LoopNest;
        node->node = f;
        node->stage = &f->stages[s];
        node->innermost = true;
        node->vectorized_loop_index = -1;
        node->tileable = tileable && (is_root() || may_subtile());
        // Set up a bound for the inside of the
        // loop. computed/required is still the full region, but
        // the loop nest will be a single representative point.
        auto single_point = bounds->make_copy();
        size_t loop_dim = f->stages[s].loop.size();
        node->size.resize(loop_dim);

        int64_t total_extent = 1;
        int64_t vector_size = 1;
        for (size_t i = 0; i < loop_dim; i++) {
            const auto &l = bounds->loops(s, i);
            // Initialize the loop nest
            node->size[i] = l.extent();
            total_extent *= node->size[i];

            // Use the first loop iteration to represent the inner
            // loop. We'll shift it to a later one once we decide
            // on vectorization.
            single_point->loops(s, i) = Span(l.min(), l.min(), true);

            internal_assert(l.max() >= l.min()) << i << " " << l.max() << " " << l.min() << "\n";

            if (f->dimensions &&
                node->size[i] >= 1 &&
                f->stages[s].loop[i].var == f->func.args()[v]) {
                node->vectorized_loop_index = (int)i;
                vector_size = (int64_t)(node->stage->vector_size);
                single_point->loops(s, i).set_extent(vector_size);
                node->size[i] += vector_size - 1;
                node->size[i] /= vector_size;

                // Shift the loops along by some multiple of the
                // vector size, to pick a more representative vector
                // than the first. We use the middle-most.
1217                 int64_t shift = vector_size * (node->size[i] / 2);
1218                 single_point->loops(s, i).translate(shift);
1219             } else {
1220                 int64_t shift = node->size[i] / 2;
1221                 single_point->loops(s, i).translate(shift);
1222             }
1223         }
1224 
1225         // Leave region required blank inside the computation of a Func
1226         node->set_bounds(f, std::move(single_point));
1227         node->vector_dim = v;
1228 
1229         if (node->vectorized_loop_index >= 0) {
1230             // Split off the single vector as an inner loop nest.
1231             node->innermost = false;
1232 
1233             LoopNest *one_vector = new LoopNest;
1234             one_vector->node = node->node;
1235             one_vector->stage = node->stage;
1236             one_vector->tileable = false;
1237             one_vector->vectorized_loop_index = node->vectorized_loop_index;
1238             one_vector->vector_dim = v;
1239             one_vector->size.resize(loop_dim, 1);
1240             one_vector->innermost = true;
1241             auto b = node->get_bounds(f)->make_copy();
1242             // Set the region computed inside this node to be the first vector lane
1243             b->loops(s, node->vectorized_loop_index).set_extent(1);
1244             one_vector->set_bounds(f, b);
1245             one_vector->size[node->vectorized_loop_index] = vector_size;
1246 
1247             node->children.emplace_back(one_vector);
1248         }
1249         children.emplace_back(node);
1250     }
1251 }
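
// Worked example of the vectorization bookkeeping above (hypothetical sizes):
// a loop of extent 100 over the vectorized var with vector_size 8 becomes an
// outer loop of size (100 + 8 - 1) / 8 = 13, with a single inner "one_vector"
// child of size 8 representing one vector's worth of work. The representative
// point is shifted to the middle-most vector, i.e. by 8 * (13 / 2) = 48.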
1252 
1253 // Parallelize this loop according to the given tiling.
1254 IntrusivePtr<const LoopNest> LoopNest::parallelize_in_tiles(const MachineParams &params,
1255                                                             const vector<int64_t> &tiling,
1256                                                             const LoopNest *parent) const {
1257 
1258     // Split this loop and move factors to the inner loop
1259     LoopNest *inner = new LoopNest, *outer = new LoopNest;
1260     inner->node = outer->node = node;
1261     inner->stage = outer->stage = stage;
1262     inner->tileable = outer->tileable = tileable && may_subtile();
1263     inner->vector_dim = outer->vector_dim = vector_dim;
1264     inner->vectorized_loop_index = outer->vectorized_loop_index = vectorized_loop_index;
1265     outer->size = size;
1266     outer->innermost = false;
1267     outer->parallel = true;
1268     outer->tileable = may_subtile();
1269 
1270     // First make an inner loop representing a 1x1x1... tile
1271     inner->size.resize(size.size(), 1);
1272     inner->innermost = innermost;
1273     inner->children = children;
1274     inner->inlined = inlined;
1275     inner->bounds = bounds;
1276     inner->store_at = store_at;
1277 
1278     auto b = inner->get_bounds(node)->make_copy();
1279 
1280     // Then move factors from the outer loop to the inner loop
1281     auto parent_bounds = parent->get_bounds(node);
1282 
1283     for (size_t i = 0; i < stage->loop.size(); i++) {
1284         int l = stage->loop[i].pure_dim;
1285 
1286         int64_t outer_extent;
1287         if (l >= 0) {
1288             internal_assert(l < (int)tiling.size()) << l << " " << tiling.size() << "\n";
1289             outer_extent = tiling[l];
1290         } else {
1291             // RVars are moved inwards
1292             outer_extent = 1;
1293         }
1294 
1295         inner->size[i] = (outer->size[i] + outer_extent - 1) / outer_extent;
1296 
1297         // Recompute the outer size given the selected inner size
1298         outer_extent = (outer->size[i] + inner->size[i] - 1) / inner->size[i];
1299 
1300         outer->size[i] = outer_extent;
1301         const auto &p = parent_bounds->loops(stage->index, i);
1302         int64_t min = p.min();
1303         int64_t extent = p.extent();
1304         extent = (extent + outer_extent - 1) / outer_extent;
1305 
1306         // Pick a better representative loop iteration for the
1307         // inner loops.
1308         min += (outer_extent / 2) * extent;
1309         bool compile_time_constant_bounds = p.constant_extent() || ((outer_extent > 1) && stage->loop[i].pure);
1310         b->loops(stage->index, i) = Span(min, min + extent - 1, compile_time_constant_bounds);
1311     }
1312     outer->set_bounds(node, b);
1313 
1314     outer->children.emplace_back(inner);
1315     return outer;
1316 }
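
// Worked example of the tiling arithmetic above (hypothetical sizes): with
// this->size = {128, 96} and tiling = {16, 8}, the inner serial loop gets
// sizes {ceil(128/16), ceil(96/8)} = {8, 12}, and the outer parallel loop is
// recomputed as {ceil(128/8), ceil(96/12)} = {16, 8}. RVar dimensions always
// get an outer extent of 1, i.e. they stay entirely in the inner loop.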
1317 
1318 // Return all possible ways to compute f in tiles somewhere within
1319 // this loop nest.
1320 vector<IntrusivePtr<const LoopNest>> LoopNest::compute_in_tiles(const FunctionDAG::Node *f,
1321                                                                 const LoopNest *parent,
1322                                                                 const MachineParams &params,
1323                                                                 int v,
1324                                                                 bool in_realization) const {
1325     internal_assert(f);
1326 
1327     vector<IntrusivePtr<const LoopNest>> result;
1328 
1329     // Some pruning to not waste time on terrible states
1330     if (parent) {
1331         const auto &bounds_here = get_bounds(f);
1332         const auto &bounds_at_parent = parent->get_bounds(f);
1333 
1334         // Don't descend into loops that break our ability to
1335         // vectorize if we could have vectorized one level up.
1336         const auto &p = bounds_here->region_computed(v);
1337         const auto &p_parent = bounds_at_parent->region_computed(v);
1338         int64_t e = p.extent();
1339         int64_t ep = p_parent.extent();
1340         if (ep >= f->vector_size && e < f->vector_size) return result;
1341 
1342         // Don't descend into loops if the bounds required don't
1343         // shrink.
1344         int64_t total_here = 1, total_at_parent = 1;
1345         for (int i = 0; i < f->dimensions; i++) {
1346             const auto &range_here = bounds_here->region_computed(i);
1347             const auto &range_at_parent = bounds_at_parent->region_computed(i);
1348             total_here *= range_here.extent();
1349             total_at_parent *= range_at_parent.extent();
1350         }
1351         if (total_here >= total_at_parent) return result;
1352     }
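
    // For instance (hypothetical numbers), with f->vector_size == 8: if the
    // region computed at the parent spans 32 elements along dimension v but
    // only 3 elements here, computing f here would forfeit vectorization, so
    // no compute sites are proposed within this loop. Likewise, if f's total
    // footprint is no smaller here than at the parent, descending further
    // buys no locality.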
1353 
1354     // Figure out which child we can fuse this into
1355     int child = -1;
1356     bool called_by_multiple_children = false;
1357     for (int i = 0; i < (int)children.size(); i++) {
1358         if (children[i]->calls(f)) {
1359             if (child != -1) {
1360                 called_by_multiple_children = true;
1361             }
1362             child = i;
1363         }
1364     }
1365 
1366     // Place the computation directly inside this loop (provided it's not a SIMD loop)
1367     if (!innermost &&
1368         (!in_realization ||
1369          size.empty() ||
1370          vector_dim == -1 ||
1371          size[vector_dim] == 1)) {
1372 
1373         std::unique_ptr<LoopNest> r{new LoopNest};
1374         r->copy_from(*this);
1375         r->compute_here(f, true, v);
1376         if (!in_realization) {
1377             r->store_at.insert(f);
1378         } else {
1379             r->tileable = false;
1380         }
1381         result.emplace_back(r.release());
1382     }
1383 
1384     if (f->is_output) {
1385         // Outputs must be compute_root, so we're done.
1386         return result;
1387     }
1388 
1389     if (tileable) {
1390         // The root node is not tileable, so all tileable nodes have parents.
1391         internal_assert(parent != nullptr);
1392 
1393         // Generate a list of tile sizes to try
1394         auto tilings = generate_tilings(size, (int)(size.size() - 1), 2, !in_realization);
1395 
1396         if (tilings.size() > 10000) {
1397             aslog(0) << "Warning: lots of tilings: " << tilings.size() << "\n";
1398         }
1399 
1400         for (auto t : tilings) {
1401             if (parallel) {
1402                 const auto &l = stage->loop;
1403                 // More pruning. Skip root-level tilings that
1404                 // would leave too many cores idle, and root-level
1405                 // tilings that would force serialization of
1406                 // dimensions we have decided to parallelize over
1407                 // in an earlier pass.
1408                 int total = 1;
1409                 size_t idx = 0;
1410                 for (auto s : t) {
1411                     if (l[idx].pure) {
1412                         total *= s;
1413                     }
1414                     idx++;
1415                 }
1416 
1417                 const double tasks_per_core = (double)total / params.parallelism;
1418                 const double idle_cores = std::ceil(tasks_per_core) / tasks_per_core;
1419                 if (idle_cores > 1.1) continue;
1420             }
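
            // Example of the idle-cores pruning above (hypothetical numbers):
            // 20 parallel tasks on params.parallelism == 16 cores gives
            // tasks_per_core = 1.25 and idle_cores = ceil(1.25) / 1.25 = 1.6,
            // which exceeds 1.1, so the tiling is skipped; 32 tasks on 16
            // cores gives 2.0 / 2.0 = 1.0 and is kept.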
1421 
1422             // Tile this loop and place the computation at some coarser granularity
1423             LoopNest *inner = new LoopNest, *outer = new LoopNest;
1424             inner->node = outer->node = node;
1425             inner->stage = outer->stage = stage;
1426             inner->tileable = outer->tileable = tileable && may_subtile();
1427             inner->vector_dim = outer->vector_dim = vector_dim;
1428             inner->vectorized_loop_index = outer->vectorized_loop_index = vectorized_loop_index;
1429             outer->size = size;
1430             outer->innermost = false;
1431             outer->parallel = parallel;
1432             inner->parallel = false;
1433 
1434             // First make an inner loop representing a 1x1x1... tile
1435             inner->size.resize(size.size(), 1);
1436             inner->innermost = innermost;
1437             inner->children = children;
1438             inner->inlined = inlined;
1439             inner->bounds = bounds;
1440             inner->store_at = store_at;
1441 
1442             {
1443                 auto b = inner->get_bounds(node)->make_copy();
1444 
1445                 // Then move factors from the outer loop to the inner loop
1446                 auto parent_bounds = parent->get_bounds(node);
1447 
1448                 for (size_t i = 0; i < t.size(); i++) {
1449                     int64_t outer_extent = t[i];
1450                     inner->size[i] = (outer->size[i] + outer_extent - 1) / outer_extent;
1451                     outer->size[i] = outer_extent;
1452                     const auto &p = parent_bounds->loops(stage->index, i);
1453                     int64_t min = p.min();
1454                     int64_t original_extent = p.extent();
1455                     int64_t inner_extent = (original_extent + outer_extent - 1) / outer_extent;
1456                     // Pick a more representative loop iteration
1457                     min += (outer_extent / 2) * inner_extent;
1458                     bool compile_time_constant_extent =
1459                         (p.constant_extent() || outer_extent > 1) &&
1460                         (inner_extent == 1 || outer_extent == 1 || stage->index == 0);
1461                     b->loops(stage->index, i) = Span(min, min + inner_extent - 1, compile_time_constant_extent);
1462                 }
1463 
1464                 // Region_{computed/required} on outer is now
1465                 // wrong, but it doesn't matter because consumers
1466                 // only look at the loops in get_bounds. Still,
1467                 // this is weird.
1468                 outer->set_bounds(node, b);
1469             }
1470 
1471             if (!in_realization) {
1472                 outer->store_at.insert(f);
1473             }
1474             outer->children.emplace_back(inner);
1475 
1476             // HACK
1477             // bool may_slide = false;
1478             bool may_slide = (!in_realization &&
1479                               f->stages.size() == 1);
1480             if (may_slide) {
1481                 // Store here, but compute further in. Currently
1482                 // don't have to worry about the constraints this
1483                 // places on parallelism, as we forced all the
1484                 // parallelism to the outer loop.
1485                 auto opts = inner->compute_in_tiles(f, outer, params, v, true);
1486                 for (IntrusivePtr<const LoopNest> &n : opts) {
1487                     LoopNest *store_at_outer_compute_further_in = new LoopNest;
1488                     store_at_outer_compute_further_in->copy_from(*outer);
1489                     store_at_outer_compute_further_in->children.pop_back();
1490                     store_at_outer_compute_further_in->children.emplace_back(std::move(n));
1491                     result.emplace_back(store_at_outer_compute_further_in);
1492                 }
1493             }
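
            // In schedule terms, the options generated above roughly
            // correspond to something like
            //     f.store_at(consumer, outer_var).compute_at(consumer, inner_var);
            // (names hypothetical), which is what enables sliding-window
            // reuse of f's storage across iterations of the outer loop.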
1494 
1495             // Site the computation inside the outer loop
1496             outer->compute_here(f, true, v);
1497             outer->tileable &= !in_realization;
1498             result.emplace_back(outer);
1499         }
1500     }
1501 
1502     if (child >= 0 && !called_by_multiple_children && !in_realization &&
1503         (may_subtile() || is_root())) {
1504         // Push the Func further inwards in the loop nest
1505 
1506         // See if it's appropriate to slide over this loop. We can't
1507         // slide at the root level if we intend to parallelize it.
1508         bool may_slide = (params.parallelism == 1) || !is_root();
1509 
1510         const auto &c = children[child];
1511         int num_ones = 0;
1512         for (size_t i = 0; i < c->size.size(); i++) {
1513             int64_t s = c->size[i];
1514             num_ones += (s == 1) ? 1 : 0;
1515         }
1516 
1517         // Some pruning:
1518 
1519         // Only slide over single-dimensional loops
1520         may_slide &= num_ones == ((int)c->size.size() - 1);
1521 
1522         // Don't slide funcs with update stages
1523         may_slide &= f->stages.size() == 1;
1524 
1525         // Don't slide over the vector dimension
1526         may_slide &= (c->vectorized_loop_index == -1 ||
1527                       c->size[c->vectorized_loop_index] == 1);
1528 
1529         for (int store_here = 0; store_here < 2; store_here++) {
1530             if (store_here && !may_slide) {
1531                 // We place all our parallel loops at the root
1532                 // level, so this would constrain parallelism.
1533                 continue;
1534             }
1535             if (is_root() && num_ones == (int)c->size.size() && params.parallelism > 1) {
1536                 // Don't fuse into serial loops, or we could never parallelize this Func.
1537                 continue;
1538             }
1539             auto opts = children[child]->compute_in_tiles(f, this, params, v, store_here);
1540             for (IntrusivePtr<const LoopNest> &n : opts) {
1541                 // Push the computation into the child (this is only
1542                 // valid because exactly one child calls f), possibly
1543                 // leaving the storage out here.
1544                 LoopNest *r = new LoopNest;
1545                 r->copy_from(*this);
1546                 if (store_here) {
1547                     r->store_at.insert(f);
1548                 }
1549                 r->children[child] = n;
1550                 result.emplace_back(r);
1551             }
1552         }
1553     }
1554 
1555     return result;
1556 }
1557 
1558 // Apply the schedule represented by this loop nest to a Halide pipeline.
1559 void LoopNest::apply(LoopLevel here,
1560                      StageMap<std::unique_ptr<StageScheduleState>> &state_map,
1561                      double num_cores,
1562                      int depth,
1563                      const LoopNest *parent,
1564                      const LoopNest *compute_site) const {
1565     if (is_root()) {
1566         for (auto &c : children) {
1567             Func(c->node->func).compute_root();
1568             c->apply(LoopLevel::root(), state_map, num_cores, 1, this, c.get());
1569             if (c->stage->index == 0) {
1570                 auto &state = state_map.get(c->stage);
1571                 state->schedule_source << "\n    .compute_root()";
1572                 // TODO: Omitting the logic for printing store_root() assumes that everything store_root is also compute_root.
1573             }
1574         }
1575     } else {
1576         // Non-root nodes always have parents.
1577         internal_assert(parent != nullptr);
1578 
1579         if (parent->node != node) {
1580             compute_site = this;
1581         }
1582 
1583         const auto &symbolic_loop = stage->loop;
1584         const auto &parent_bounds = parent->get_bounds(node);
1585         if (!state_map.contains(stage)) {
1586             StageScheduleState *state = new StageScheduleState;
1587             state->num_cores = num_cores;
1588             state->vector_dim = vector_dim;
1589             state->vectorized_loop_index = vectorized_loop_index;
1590             for (size_t i = 0; i < symbolic_loop.size(); i++) {
1591                 StageScheduleState::FuncVar fv;
1592                 const auto &l = symbolic_loop[i];
1593                 fv.var = VarOrRVar(l.var, !l.pure);
1594                 fv.orig = fv.var;
1595                 fv.accessor = l.accessor;
1596                 const auto &p = parent_bounds->loops(stage->index, i);
1597                 fv.extent = p.extent();
1598                 fv.constant_extent = p.constant_extent();
1599                 fv.outermost = true;
1600                 fv.parallel = l.pure && parallel;
1601                 fv.exists = true;
1602                 fv.pure = l.pure;
1603                 fv.index = i;
1604                 fv.innermost_pure_dim = (i == (size_t)vectorized_loop_index);
1605                 state->vars.push_back(fv);
1606             }
1607             // Bubble the innermost pure dimension to the front of the pure dimensions
1608             for (int i = vectorized_loop_index - 1;
1609                  i >= 0 && state->vars[i].pure; i--) {
1610                 std::swap(state->vars[i], state->vars[i + 1]);
1611             }
1612             state_map.emplace(stage, std::unique_ptr<StageScheduleState>(state));
1613         }
1614         auto &state = *(state_map.get(stage));
1615 
1616         // Func handles are grabbed in reverse topological order.
1617         Stage s = Func(node->func);
1618         if (stage->index > 0) {
1619             s = Func(node->func).update(stage->index - 1);
1620         }
1621 
1622         if (stage->index == 0 && parent->node != node) {
1623             // Pick a memory type
1624             double bytes = node->bytes_per_point;
1625             for (int i = 0; i < node->dimensions; i++) {
1626                 const auto &p = parent_bounds->region_computed(i);
1627                 bytes *= p.extent();
1628             }
1629             if (bytes < 64000 && depth > 2) {
1630                 // If it's probably a small allocation, and it's
1631                 // made more than once, use stack-scoped
1632                 // storage. Otherwise let the compiler pick heap
1633                 // or stack as it likes.
1634                 Func(node->func).store_in(MemoryType::Stack);
1635                 state.schedule_source << "\n    .store_in(MemoryType::Stack)";
1636             }
1637         }
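
        // Rough illustration of the threshold above (hypothetical sizes): a
        // 100 x 100 float32 intermediate is 40,000 bytes, so at depth > 2 it
        // is placed on the stack; a 256 x 256 float32 intermediate (262,144
        // bytes) is left for the compiler to place as it sees fit.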
1638 
1639         // Pick a tail strategy for any splits of pure vars. RVars always use GuardWithIf.
1640         auto pure_var_tail_strategy = TailStrategy::Auto;
1641         if (!compute_site->accesses_input_buffer() && !node->is_output) {
1642             // RoundUp is the lowest-overhead option, provided it doesn't
1643             // expand the bounds read on the input or written on
1644             // the output. However, you can only really use it on
1645             // pure stages that don't access the input anywhere in
1646             // their loop nest.
1647             pure_var_tail_strategy = TailStrategy::RoundUp;
1648         } else if (stage->index == 0) {
1649             // Pure stages that access the input use ShiftInwards.
1650             pure_var_tail_strategy = TailStrategy::ShiftInwards;
1651         } else {
1652             // For pure vars in update stages that access the
1653             // input, it's not safe to round up or redundantly
1654             // recompute
1655             pure_var_tail_strategy = TailStrategy::GuardWithIf;
1656         }
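
        // Concretely (hypothetical extents): splitting an extent-100 pure
        // loop by 8 leaves a tail of 4. RoundUp simply evaluates 104 points,
        // ShiftInwards slides the final tile back to cover points 92..99
        // (recomputing four of them), and GuardWithIf wraps the body in a
        // condition so that only the 100 valid points are evaluated.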
1657 
1658         if (!size.empty()) {
1659             if (innermost) {
1660                 if (vectorized_loop_index >= 0) {
1661                     size_t i = 0;
1662                     while (!state.vars[i].innermost_pure_dim)
1663                         i++;
1664                     auto &v = state.vars[i];
1665                     internal_assert(v.innermost_pure_dim && v.exists) << v.var.name() << "\n";
1666                     // This var is the result of a split.
1667                     state.schedule_source
1668                         << "\n    .vectorize(" << v.var.name() << ")";
1669                     s.vectorize(v.var);
1670                 }
1671             } else {
1672                 // Grab the innermost loop for this node
1673                 const LoopNest *innermost_loop = this, *child = nullptr;
1674                 while (!innermost_loop->innermost) {
1675                     for (const auto &c : innermost_loop->children) {
1676                         if (c->node == node) {
1677                             if (!child) {
1678                                 child = c.get();
1679                             }
1680                             innermost_loop = c.get();
1681                             break;
1682                         }
1683                     }
1684                 }
1685 
1686                 // Do the implied splits
1687                 vector<StageScheduleState::FuncVar> new_inner;
1688                 for (size_t i = 0; i < symbolic_loop.size(); i++) {
1689                     StageScheduleState::FuncVar v;
1690                     StageScheduleState::FuncVar &parent = state.vars[i];
1691 
1692                     int64_t factor = (parent.extent + size[parent.index] - 1) / size[parent.index];
1693                     int64_t innermost_size = innermost_loop->size[parent.index];
1694 
1695                     if (child && parent.innermost_pure_dim) {
1696                         // Ensure the split is a multiple of the
1697                         // vector size. With all these rounded
1698                         // divs going on it can drift.
1699                         factor = ((factor + innermost_size - 1) / innermost_size) * innermost_size;
1700                     }
1701 
1702                     if (child && innermost_size > factor) {
1703                         factor = innermost_size;
1704                     }
1705 
1706                     if (!parent.exists || factor == 1) {
1707                         v.exists = false;
1708                         v.extent = 1;
1709                     } else if (size[parent.index] == 1 && !(child &&
1710                                                             child->innermost &&
1711                                                             parent.innermost_pure_dim &&
1712                                                             parent.var.name() == parent.orig.name())) {
1713                         // Not split in this dimension
1714                         v = parent;
1715                         v.parallel = false;
1716                         parent.exists = false;
1717                         parent.extent = 1;
1718                     } else {
1719                         VarOrRVar inner(Var(parent.var.name() + "i"));
1720                         if (parent.var.is_rvar) {
1721                             inner = RVar(parent.var.name() + "i");
1722                         }
1723 
1724                         auto tail_strategy = pure_var_tail_strategy;
1725                         // If it's an RVar, or if it's not the outermost split and we're in an update stage, we need GuardWithIf instead.
1726                         if (parent.var.is_rvar || (stage->index != 0 && !parent.outermost)) {
1727                             tail_strategy = TailStrategy::GuardWithIf;
1728                         }
1729 
1730                         if (factor > parent.extent && tail_strategy == TailStrategy::ShiftInwards) {
1731                             // Don't shift all the way off the image.
1732                             tail_strategy = TailStrategy::GuardWithIf;
1733                         }
1734 
1735                         s.split(parent.var, parent.var, inner, (int)factor, tail_strategy);
1736                         state.schedule_source
1737                             << "\n    .split("
1738                             << parent.var.name() << ", "
1739                             << parent.var.name() << ", "
1740                             << inner.name() << ", "
1741                             << factor << ", "
1742                             << "TailStrategy::" << tail_strategy << ")";
1743                         v = parent;
1744                         parent.extent = size[parent.index];
1745                         v.constant_extent = (tail_strategy != TailStrategy::GuardWithIf);
1746                         v.var = inner;
1747                         v.accessor.clear();
1748                         v.extent = factor;
1749                         v.parallel = false;
1750                         v.outermost = false;
1751                     }
1752                     new_inner.push_back(v);
1753                 }
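
                // For a hypothetical pure var x with extent 128 at the parent
                // and 4 iterations at this level, factor = ceil(128 / 4) = 32,
                // so the loop above emits
                //     s.split(x, x, xi, 32, tail_strategy);
                // (ShiftInwards for a pure stage that reads the input) and
                // records the same call in schedule_source; x keeps extent 4
                // and the new inner var xi gets extent 32.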
1754 
1755                 if (child->innermost) {
1756                     // Maybe do some unrolling
1757 
1758                     int64_t product_of_pure_loops = 1;
1759                     bool all_pure_loops_constant_size = true;
1760                     for (size_t i = 0; i < symbolic_loop.size(); i++) {
1761                         if (state.vars[i].pure) {
1762                             product_of_pure_loops *= state.vars[i].extent;
1763                             all_pure_loops_constant_size &= state.vars[i].constant_extent;
1764                         }
1765                     }
1766 
1767                     if (product_of_pure_loops <= kUnrollLimit && all_pure_loops_constant_size) {
1768                         // There's hope that anything computed at this level can fit into registers if we fully unroll.
1769                         // TODO: 16 should be the number of vector registers in the architecture
1770                         std::stable_sort(state.vars.begin(), state.vars.begin() + symbolic_loop.size(),
1771                                          [](const StageScheduleState::FuncVar &a, const StageScheduleState::FuncVar &b) {
1772                                              return a.pure && !b.pure;
1773                                          });
1774 
1775                         for (size_t i = 0; i < symbolic_loop.size(); i++) {
1776                             if (state.vars[i].pure && state.vars[i].exists && state.vars[i].extent > 1) {
1777                                 s.unroll(state.vars[i].var);
1778                                 state.schedule_source << "\n    .unroll(" << state.vars[i].var.name() << ")";
1779                             }
1780                         }
1781                     }
1782                 }
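
                // Example of the unrolling heuristic above (hypothetical
                // extents): an inner 3 x 4 pure loop nest with constant bounds
                // has a product of 12 <= kUnrollLimit, so both loops are fully
                // unrolled; a 4 x 4 nest (product 16), or one with a
                // non-constant extent, is left rolled.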
1783 
1784                 bool found = false;
1785                 for (const auto &v : state.vars) {
1786                     if (!v.exists) continue;
1787                     here = LoopLevel(node->func, v.var);
1788                     found = true;
1789                     break;
1790                 }
1791                 if (!found) {
1792                     here = LoopLevel(node->func, Var::outermost());
1793                 }
1794                 // internal_assert(found) << "Could not find appropriate compute_at location for children of " << node->func.name() << "\n";
1795                 state.vars.insert(state.vars.begin(), new_inner.begin(), new_inner.end());
1796             }
1797         }
1798         if (innermost) {
1799             internal_assert(store_at.empty());
1800             internal_assert(children.empty());
1801             return;
1802         }
1803 
1804         for (auto f : store_at) {
1805             Func(f->func).store_at(here);
1806         }
1807         for (auto s : size) {
1808             num_cores /= s;
1809         }
1810         here.lock();
1811         string loop_level;
1812         if (here.is_root()) {
1813             loop_level = "_root()";
1814         } else {
1815             loop_level = "_at(" + here.func() + ", " + here.var().name() + ")";
1816         }
1817         for (auto &c : children) {
1818             if (c->node != node) {
1819                 Func(c->node->func).compute_at(here);
1820             }
1821             c->apply(here, state_map, num_cores, depth + 1, this, compute_site);
1822             if (c->node != node && c->stage->index == 0) {
1823                 auto &state = *(state_map.get(c->stage));
1824                 state.schedule_source << "\n    .compute" << loop_level;
1825             }
1826         }
1827         for (auto f : store_at) {
1828             bool computed_here = false;
1829             for (auto &c : children) {
1830                 if (c->node == f) {
1831                     computed_here = true;
1832                     break;
1833                 }
1834             }
1835             if (!computed_here) {
1836                 auto &state = *(state_map.get(&(f->stages[0])));
1837                 state.schedule_source << "\n    .store" << loop_level;
1838             }
1839         }
1840     }
1841 }
1842 
1843 }  // namespace Autoscheduler
1844 }  // namespace Internal
1845 }  // namespace Halide
1846