#include "LoopNest.h"

using std::pair;
using std::set;
using std::string;
using std::vector;

namespace Halide {
namespace Internal {
namespace Autoscheduler {

// How small should an innermost loop cluster be before you just
// entirely unroll the thing. Sized for an architecture with 16 vector
// registers.
const int kUnrollLimit = 12;
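// (As a point of reference, the gemm-style tilings mentioned below
// try to keep 12 accumulators live in vector registers, so a cluster
// of up to 12 iterations can be unrolled without spilling.)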

// Get the HL_NO_SUBTILING environment variable. Setting it to 1
// restricts the search to the Mullapudi et al. scheduling space, in
// which a loop that has been tiled is never subtiled.
bool get_may_subtile() {
    string no_subtiling_str = get_env_variable("HL_NO_SUBTILING");
    if (no_subtiling_str == "1") {
        return false;
    } else {
        return true;
    }
}
bool may_subtile() {
    static bool b = get_may_subtile();
    return b;
}
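// (To reproduce the restricted search space from the command line,
// set the variable before invoking the autoscheduler, e.g.
// HL_NO_SUBTILING=1.)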

// Given a multi-dimensional box of dimensionality d, generate a list
// of candidate tile sizes for it, logarithmically spacing the sizes
// using the given factor. If 'allow_splits' is false, every dimension
// must either be one, or the full extent of the box. This function is
// used to generate candidate tilings when tiling for
// producer-consumer fusion, or tiling for parallelism.
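// For example, for s == {16, 16} with allow_splits == false, the
// candidates are {1, 16} and {16, 1}; the all-ones and all-full
// tilings are skipped as trivial.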
vector<vector<int64_t>> generate_tilings(const vector<int64_t> &s, int d, int factor, bool allow_splits) {
    vector<vector<int64_t>> result;
    if (d == -1) {
        result.push_back(vector<int64_t>());
    } else {
        vector<vector<int64_t>> v = generate_tilings(s, d - 1, factor, allow_splits);
        // If we've already generated too many tiling configurations
        // for the inner loops, search the outer loops with coarser
        // granularity.
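        // (With factor == 2, once more than 200 configurations exist
        // the tile-size ladder for the remaining outer dimensions
        // steps by factors of 4, then 8, and so on.)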
        while (v.size() > (size_t)factor * 100) {
            factor *= 2;
        }

        for (auto &t : v) {
            bool is_full = false, is_one = false;
            // Skip trivial tilings
            if ((size_t)d == s.size() - 1) {
                is_one = is_full = true;
                for (int i = 0; i < d; i++) {
                    is_one &= (t[i] == 1);
                    is_full &= (t[i] == s[i]);
                }
            }
            t.push_back(0);
            if (!allow_splits) {
                if (!is_one) {
                    t.back() = 1;
                    result.push_back(t);
                }
                if (s[d] != 1 && !is_full) {
                    t.back() = s[d];
                    result.push_back(t);
                }
            } else {
                int max_inner = 0;
                for (int inner = 1; inner < s[d]; inner *= factor) {
                    int outer = (s[d] + inner - 1) / inner;
                    if (is_one && outer == 1) continue;
                    if (is_full && outer == s[d]) continue;
                    // Stop when we hit inner sizes that would do too much recompute
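                    // (e.g. for s[d] == 100, inner == 16 gives
                    // outer == 7, i.e. 112 points computed, and is
                    // kept; inner == 32 gives outer == 4, i.e. 128
                    // points for a 100-point loop, ~28% waste, and
                    // stops the walk.)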
                    if (inner > 1 && inner * outer * 7 > s[d] * 8) break;
                    max_inner = inner;
                    t.back() = outer;
                    result.push_back(t);
                }
                for (int outer = 1; outer <= s[d]; outer *= factor) {
                    int inner = (s[d] + outer - 1) / outer;
                    if (is_one && outer == 1) continue;
                    if (is_full && outer == s[d]) continue;
                    // Stop when we get into the regime covered by the loop above.
                    if (outer > 1 && inner < max_inner * 2) break;
                    // Or when the wasted compute gets too bad.
                    if (inner * outer * 7 > s[d] * 8) break;
                    t.back() = outer;
                    result.push_back(t);
                }

                // The sequence above (in terms of the inner loop)
                // goes 1 2 4 8 16 ... but 3 is an important inner
                // tiling factor for matrix multiply/gemm-type loops
                // which try to use 12 vector registers.
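                // (For s[d] == 100 this adds the tiling with
                // inner == 3, outer == 34, i.e. 102 points computed.)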
                int inner3 = 3;
                int outer3 = (s[d] + inner3 - 1) / inner3;
                if (factor == 2 && inner3 < s[d] && outer3 < s[d] && outer3 > 1) {
                    if (inner3 * outer3 * 7 <= s[d] * 8) {
                        t.back() = outer3;
                        result.push_back(t);
                    }
                }
            }
        }
    }
    return result;
}

void LoopNest::copy_from(const LoopNest &n) {
    size = n.size;
    children = n.children;
    inlined = n.inlined;
    store_at = n.store_at;
    bounds = n.bounds;
    node = n.node;
    stage = n.stage;
    innermost = n.innermost;
    tileable = n.tileable;
    parallel = n.parallel;
    vector_dim = n.vector_dim;
    vectorized_loop_index = n.vectorized_loop_index;
}

// Hash the loop structure and sizes up to a fixed depth. This is
// used as the hash function for the coarse-to-fine beam search in
// the paper.
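// Each level of the nest consumes two units of depth: one for the
// loop sizes of the children (hashed when depth > 0) and one for
// descending into the children themselves (when depth > 1), so a
// hash at depth 2k compares structure k levels down.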
void LoopNest::structural_hash(uint64_t &h, int depth) const {
    if (depth < 0) return;

    // Which Funcs are store_at this level?
    for (const auto *n : store_at) {
        hash_combine(h, n->id);
    }

    hash_combine(h, -1);

    // Which Funcs are compute_at this level?
    for (const auto &c : children) {
        hash_combine(h, c->stage->id);
    }

    // Add a barrier to ensure that moving something from the last
    // compute_at to the first inlined doesn't result in the same
    // hash.
    hash_combine(h, -1);

    // Which Funcs are inlined at this level?
    for (auto it = inlined.begin(); it != inlined.end(); it++) {
        hash_combine(h, it.key()->id);
    }

    hash_combine(h, -1);

    if (depth > 0) {
        // What are the loop sizes of the children?
        for (const auto &c : children) {
            for (int64_t s : c->size) {
                if (depth == 1) {
                    // Just take the most significant bit: is it one or not?
                    s = (s > 1) ? 1 : 0;
                }
                hash_combine(h, s);
            }
        }

        // Which dimension are we vectorized over?
        hash_combine(h, vectorized_loop_index);
    }

    if (depth > 1) {
        // Descend into children
        for (const auto &c : children) {
            c->structural_hash(h, depth - 2);
        }
    }
}

// Compute all the sites of interest for each pipeline stage
void LoopNest::get_sites(StageMap<Sites> &sites,
                         const LoopNest *task,
                         const LoopNest *parent) const {
    if (!task && !is_root()) {
        task = this;
    }
    for (const auto &c : children) {
        c->get_sites(sites, task, this);
    }
    if (parent && node != parent->node) {
        auto &s = sites.get_or_create(stage);
        s.compute = parent;
        s.produce = this;
        s.task = task;
    }
    for (auto f : store_at) {
        for (const auto &s : f->stages) {
            sites.get_or_create(&s).store = this;
        }
    }
    for (auto it = inlined.begin(); it != inlined.end(); it++) {
        auto &s = sites.get_or_create(&(it.key()->stages[0]));
        s.inlined = true;
        s.compute = s.store = s.produce = s.innermost = this;
        s.task = task;
    }
    if (innermost) {
        sites.get_or_create(stage).innermost = this;
    }
}

// Do a recursive walk over the loop nest computing features to feed the cost model.
void LoopNest::compute_features(const FunctionDAG &dag,
                                const MachineParams &params,
                                const StageMap<Sites> &sites,
                                int64_t instances,
                                int64_t parallelism,
                                const LoopNest *parent,
                                const LoopNest *grandparent,
                                const LoopNest &root,
                                int64_t *working_set,
                                StageMap<ScheduleFeatures> *features) const {
    int64_t working_set_here = 0;

    int64_t loop_instances = 1, parallel_tasks = 1;
    bool in_impure = false;
    for (int idx = (int)size.size() - 1; idx >= 0; idx--) {
        size_t i = size[idx];
        loop_instances *= i;
        if (stage->loop[idx].pure && !in_impure) {
            if (params.parallelism > 1 &&
                (parallel || (parent->is_root() && parallel_tasks < params.parallelism))) {
                // Either we've picked our parallel tiling, or
                // it's not yet determined. Assume we'll not split
                // any loops and just stop after we hit the
                // required number of cores
                parallel_tasks *= i;
                // If we haven't picked our parallel tiling yet,
                // assume that we'll target 8*cores when we do,
                // which is a common rule of thumb.
                if (!parallel && parallel_tasks > params.parallelism * 8) {
                    // We would split this loop
                    parallel_tasks = params.parallelism * 8;
                }
            }
        } else if (i != 1) {
            in_impure = true;
        }
    }
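    // (e.g. with params.parallelism == 8 and a pure outer loop of
    // extent 1000, parallel_tasks saturates at 64, modeling a split
    // into 64 tasks of roughly 16 iterations each.)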

    int64_t subinstances = instances * loop_instances;

    for (const auto *node : store_at) {
        // Figure out the features at the store_at level
        const auto &bounds = get_bounds(node);

        for (size_t s = 0; s < node->stages.size(); s++) {
            // TODO: Lift invariants from this loop. Most of it's the same for every stage.
            internal_assert(!node->is_input);
            ScheduleFeatures &feat = features->get_or_create(&(node->stages[s]));

            feat.num_realizations = subinstances;

            feat.points_computed_per_realization = 1;
            feat.num_scalars = feat.num_vectors = subinstances;
            bool vectorized = false;
            for (int i = 0; i < (int)node->stages[s].loop.size(); i++) {
                const auto &p = bounds->loops(s, i);
                int64_t extent = p.extent();
                feat.points_computed_per_realization *= extent;
                if (i == sites.get(&(node->stages[s])).produce->vectorized_loop_index) {
                    // Assumes that we're not going to split
                    // things such that non-native-width
                    // vectorization is a problem, except for the
                    // tail.
                    feat.num_vectors *= extent / node->stages[s].vector_size;
                    feat.num_scalars *= extent % node->stages[s].vector_size;
                    vectorized = true;
                } else {
                    feat.num_vectors *= extent;
                    feat.num_scalars *= extent;
                }
            }
            if (!vectorized) {
                feat.num_vectors = 0;
            }
            feat.points_computed_total = feat.points_computed_per_realization * feat.num_realizations;

            feat.bytes_at_realization = node->bytes_per_point;
            for (int i = 0; i < node->dimensions; i++) {
                const auto &p = bounds->region_computed(i);
                feat.bytes_at_realization *= p.extent();
            }
            int64_t innermost_storage_extent = 1;
            int v = sites.get(&(node->stages[s])).produce->vector_dim;
            if (v >= 0 && node->dimensions > 0) {
                innermost_storage_extent = bounds->region_computed(v).extent();
            }
            feat.innermost_bytes_at_realization = node->bytes_per_point * innermost_storage_extent;

            if (!is_root()) {
                feat.bytes_at_task = feat.bytes_at_realization;
                feat.innermost_bytes_at_task = feat.innermost_bytes_at_realization;
            }
        }
    }

    if (is_root()) {
        // TODO: This block of code is repeated below. Refactor
        for (const auto &c : children) {
            c->compute_features(dag, params, sites, subinstances, parallelism, this, parent, root, &working_set_here, features);
        }

        for (const auto *node : store_at) {
            auto &feat = features->get(&(node->stages[0]));
            working_set_here += feat.bytes_at_production;
        }
        for (const auto *node : store_at) {
            for (const auto &s : node->stages) {
                auto &feat = features->get(&s);
                feat.working_set_at_realization = working_set_here;
            }
        }
        for (const auto &c : children) {
            if (c->node != node) {
                auto &feat = features->get(c->stage);
                feat.working_set_at_production = working_set_here;
            }
        }

        // Figure out the root-level features for every Func
        for (auto it = features->begin(); it != features->end(); it++) {
            const auto *stage = it.key();
            const auto *node = stage->node;
            auto &feat = it.value();
            const auto &root_bounds = root.get_bounds(node);

            feat.bytes_at_root = node->bytes_per_point;
            for (int i = 0; i < node->dimensions; i++) {
                const auto &p = root_bounds->region_computed(i);
                feat.bytes_at_root *= p.extent();
            }

            feat.working_set_at_root = working_set_here;

            auto *p = sites.get(stage).produce;
            if (p) {
                // Extent of the innermost dimension in the storage layout
                int64_t innermost_storage_extent = 1;
                int v = p->vector_dim;
                if (v >= 0 && v < node->dimensions) {
                    innermost_storage_extent = root_bounds->region_computed(v).extent();
                }
                feat.innermost_bytes_at_root = node->bytes_per_point * innermost_storage_extent;
            } else {
                feat.innermost_bytes_at_root = 0;
            }

            feat.points_computed_minimum = 1;
            for (int i = 0; i < (int)stage->loop.size(); i++) {
                const auto &p = root_bounds->loops(stage->index, i);
                feat.points_computed_minimum *= p.extent();
            }

            if (node->stages.size() == 1 && !node->is_output) {
                int64_t points_computed_minimum_if_inlined = 0;
                for (auto *e : node->outgoing_edges) {
                    points_computed_minimum_if_inlined += features->get(e->consumer).points_computed_minimum * e->calls;
                }
                feat.points_computed_minimum = std::min(feat.points_computed_minimum, (double)points_computed_minimum_if_inlined);
            }
        }

        return;
    }

    int64_t subparallelism = parallel_tasks * parallelism;

    // Figure out the features at the compute_at level
    internal_assert(!stage->node->is_input);
    ScheduleFeatures &feat = features->get_or_create(stage);

    if (innermost) {
        if (vectorized_loop_index >= 0 && vectorized_loop_index < (int)size.size()) {
            feat.vector_size = size[vectorized_loop_index];
        } else {
            feat.vector_size = 1;
        }
        if (feat.vector_size == 1) {
            // They're all scalars
            feat.num_scalars += feat.num_vectors;
            feat.num_vectors = 0;
        }
    } else {
        // We want these features just outside the innermost loop,
        // so just set them at every level and let them get
        // progressively overwritten as we descend the loop nest
        // tree.
        size_t idx = 0;
        feat.innermost_loop_extent = 1;
        feat.innermost_pure_loop_extent = 1;
        for (const auto &l : stage->loop) {
            feat.innermost_loop_extent *= size[idx];
            if (!l.rvar) {
                feat.innermost_pure_loop_extent *= size[idx];
            }
            idx++;
        }
    }

    const bool at_task = parent->is_root();
    const bool at_production = parent->node != node;
    const bool at_pure_production = at_production && stage->index == 0;

    if (at_task) {
        if (parallel) {
            const auto &bounds = get_bounds(node);
            feat.bytes_at_task = node->bytes_per_point;
            int64_t innermost_storage_extent = 1;
            for (int i = 0; i < node->dimensions; i++) {
                int64_t outer = 1;
                for (size_t l = 0; l < stage->loop.size(); l++) {
                    if (stage->loop[l].var == node->func.args()[i]) {
                        outer = size[l];
                        break;
                    }
                }
                const auto &p = bounds->region_computed(i);
                int64_t extent = p.extent();
                extent /= outer;
                feat.bytes_at_task *= extent;
                if (i == vector_dim) {
                    innermost_storage_extent = extent;
                }
            }
            feat.innermost_bytes_at_task = node->bytes_per_point * innermost_storage_extent;
        } else {
            // How this loop will be parallelized is not yet
            // determined. Use optimistic values for the features.
            feat.bytes_at_task = (feat.bytes_at_realization + params.parallelism - 1) / params.parallelism;
            feat.innermost_bytes_at_task = std::min(feat.bytes_at_task, feat.innermost_bytes_at_realization);
        }

        feat.unique_bytes_read_per_task = 0;
        feat.unique_lines_read_per_task = 0;

        // We're at a parallel for loop. Check all the accesses
        // done by Funcs inside this loop to values computed
        // outside of it to figure out how much data we'll be
        // streaming onto the core.
        vector<const FunctionDAG::Edge *> pending;
        set<const FunctionDAG::Node *> done;
        for (const auto *e : stage->incoming_edges) {
            pending.push_back(e);
        }
        while (!pending.empty()) {
            const auto *e = pending.back();
            pending.pop_back();
            if (done.count(e->producer)) continue;
            done.insert(e->producer);
            const auto &site = sites.get(&(e->producer->stages[0]));
            if (site.store->is_root()) {
                const auto &b = get_bounds(e->producer);
                int64_t bytes = e->producer->bytes_per_point, lines = 1;
                int64_t max_extent = 1;
                // clang-format off
                int vector_dim = (e->producer->is_input   ? 0 :
                                  site.produce != nullptr ? site.produce->vector_dim :
                                                            -1);
                // clang-format on
                for (int i = 0; i < e->producer->dimensions; i++) {
                    int64_t extent = b->region_required(i).extent();
                    max_extent = std::max(extent, max_extent);
                    bytes *= extent;
                    if (i != vector_dim) {
                        lines *= extent;
                    }
                }
                if (!e->producer->is_input && site.produce == nullptr) {
                    // We haven't scheduled the producer so we
                    // don't know the memory layout yet. Assume
                    // the best case.
                    lines /= max_extent;
                }
                feat.unique_bytes_read_per_task += bytes;
                feat.unique_lines_read_per_task += lines;

            } else if (site.produce != nullptr) {
                // Computation must be nested inside this task or inlined into it.
                for (const auto &s : e->producer->stages) {
                    for (const auto *e2 : s.incoming_edges) {
                        pending.push_back(e2);
                    }
                }
            }
        }
    }

    if (at_production) {
        feat.num_productions = instances;
        feat.inner_parallelism = parallel_tasks;
        feat.outer_parallelism = parallelism;
        feat.native_vector_size = stage->vector_size;

        const auto &bounds = parent->get_bounds(node);

        feat.bytes_at_production = node->bytes_per_point;
        for (int i = 0; i < node->dimensions; i++) {
            const auto &p = bounds->region_computed(i);
            feat.bytes_at_production *= p.extent();
        }
        int64_t innermost_storage_extent = 1;
        if (vector_dim >= 0 && node->dimensions > 0) {
            innermost_storage_extent = bounds->region_computed(vector_dim).extent();
        }
        feat.innermost_bytes_at_production = node->bytes_per_point * innermost_storage_extent;
    }

    // Recurse inwards
    for (const auto &c : children) {
        c->compute_features(dag, params, sites, subinstances, subparallelism, this, parent, root, &working_set_here, features);
    }
    for (const auto *node : store_at) {
        auto &feat = features->get(&(node->stages[0]));
        working_set_here += feat.bytes_at_production;
    }
    for (const auto *node : store_at) {
        for (const auto &s : node->stages) {
            auto &feat = features->get(&s);
            feat.working_set_at_realization = working_set_here;
        }
    }
    for (const auto &c : children) {
        if (c->node != node) {
            auto &feat = features->get(c->stage);
            feat.working_set_at_production = working_set_here;
        }
    }

    if (at_task) {
        set_working_set_at_task_feature(working_set_here, features);
    }

    if (at_production) {
        feat.working_set = working_set_here;
    }

    if (innermost) {
        bool parent_unrolled =
            (feat.innermost_pure_loop_extent <= kUnrollLimit &&
             parent->node == node);

        if (parent_unrolled) {
            const auto &grandparent_bounds = grandparent->get_bounds(node);
            for (size_t i = 0; i < parent->size.size(); i++) {
                if (!stage->loop[i].rvar) {
                    const auto &l = grandparent_bounds->loops(parent->stage->index, i);
                    parent_unrolled &= l.constant_extent();
                }
            }
        }

        if (parent_unrolled) {
            feat.unrolled_loop_extent = feat.innermost_pure_loop_extent;
        } else {
            feat.unrolled_loop_extent = 1;
        }
    }

    *working_set += working_set_here;

    // Analyze all memory dependencies of this stage, looking
    // through any Funcs inlined into it. This is where we track
    // things like vector gathers.
    int64_t bytes_loaded = 0, lines_loaded = 0, allocation_bytes_loaded = 0;
    double num_dense_loads = 0, num_broadcasts = 0,
           num_gathers = 0, num_stride_2_loads = 0,
           num_stride_3_loads = 0, num_stride_4_loads = 0,
           num_loads = 0;
    if (innermost || at_production) {  // These are the sites at which we compute load footprints
        // Pick the site at which we will compute the footprint relationship
        const auto &consumer_site = sites.get(stage);

        // The store_at location of the consumer
        const auto *consumer_store_site = innermost ? parent : consumer_site.store;

        // The parallel loop of the consumer
        const auto *consumer_task_site = consumer_site.task;

        int64_t consumer_instances = innermost ? instances : feat.num_realizations;
        internal_assert(consumer_instances != 0);

        vector<const FunctionDAG::Node::Stage *> pending;
        pending.emplace_back(stage);
        vector<pair<LoadJacobian, FunctionDAG::Node *>> jacobians;
        set<const FunctionDAG::Node *> done;
        while (!pending.empty()) {
            auto p = pending.back();
            pending.pop_back();
            const auto &next_edges = p->incoming_edges;
            for (const auto *e : next_edges) {
                internal_assert(sites.contains(&(e->producer->stages[0])))
                    << "No site found for " << e->producer->func.name() << "\n";

                const auto &site = sites.get(&(e->producer->stages[0]));

                bool producer_has_been_scheduled = e->producer->is_input || (site.produce != nullptr);

                if (innermost) {
                    if (e->consumer == stage) {
                        for (auto &j : e->load_jacobians) {
                            jacobians.emplace_back(j, e->producer);
                        }
                    } else {
                        // Consumer was inlined. Multiply the Jacobians to look through it.
                        decltype(jacobians) new_jacobians;
                        for (auto &j1 : jacobians) {
                            if (e->consumer->node == j1.second) {
                                for (auto &j2 : e->load_jacobians) {
                                    LoadJacobian j = j2 * j1.first;
                                    new_jacobians.emplace_back(j, e->producer);
                                }
                            } else {
                                new_jacobians.emplace_back(std::move(j1));
                            }
                        }
                        jacobians.swap(new_jacobians);
                    }
                }

                if (site.inlined) {
                    // Recursively examine the inputs
                    pending.emplace_back(&(e->producer->stages[0]));
                    continue;
                }

                // The producer's compute_at site
                const auto *producer_compute_site = site.compute;

                // The producer's store_at site
                const auto *producer_store_site = site.store;

                // The region required of the producer at various sites.
                const auto &bounds = consumer_store_site->get_bounds(e->producer);
                const auto &task_bounds = consumer_task_site->get_bounds(e->producer);
                const auto &producer_compute_bounds = producer_compute_site->get_bounds(e->producer);
                const auto &producer_store_bounds = producer_store_site->get_bounds(e->producer);

                // Compute memory footprints in terms of the
                // number of contiguous lines, and the number of
                // bytes.
                int64_t footprint = e->producer->bytes_per_point;
                int64_t compute_footprint = footprint;
                int64_t store_footprint = footprint;
                int64_t task_footprint = footprint;
                int64_t line_footprint = 1;
                int64_t compute_line_footprint = 1;
                int64_t store_line_footprint = 1;
                int64_t task_line_footprint = 1;

                if (e->producer->is_input) {
                    // This node represents an input. Its sites
                    // should be at the root level.
                    internal_assert(producer_store_site->is_root());
                    internal_assert(producer_compute_site->is_root());
                }

                if (innermost) {

                    // Grab the Jacobians that describe the memory dependence
                    for (const auto &jac : jacobians) {
                        if (jac.second != e->producer) continue;
                        double n = jac.first.count();

                        // Classify them to figure out what's going on in the vector dimension.
                        bool vector_broadcast = true;
                        bool dense_vector_load = true;
                        bool stride_2_vector_load = true;
                        bool stride_3_vector_load = true;
                        bool stride_4_vector_load = true;
                        int producer_innermost_dim =
                            (e->producer->is_input ? 0 :  // Assume default storage layout for inputs
                             !producer_has_been_scheduled ? -1 :
                                                            site.produce->vector_dim);
                        if (vectorized_loop_index >= 0) {
                            if (!producer_has_been_scheduled) {
                                // Operate optimistically and just
                                // see if *any* dimension of the
                                // producer would make for a good
                                // load.
                                int count[5] = {0, 0, 0, 0, 0};
                                for (int i = 0; i < e->producer->dimensions; i++) {
                                    auto stride = jac.first(i, vectorized_loop_index);
                                    // stride is a rational. Check to see if it's a small integer.
                                    if (stride == 0) {
                                        count[0]++;
                                    } else if (stride == 1) {
                                        count[1]++;
                                    } else if (stride == 2) {
                                        count[2]++;
                                    } else if (stride == 3) {
                                        count[3]++;
                                    } else if (stride == 4) {
                                        count[4]++;
                                    }
                                }
                                vector_broadcast = (count[0] == e->producer->dimensions);
                                dense_vector_load = (count[0] == e->producer->dimensions - 1 && count[1] == 1);
                                stride_2_vector_load = (count[0] == e->producer->dimensions - 1 && count[2] == 1);
                                stride_3_vector_load = (count[0] == e->producer->dimensions - 1 && count[3] == 1);
                                stride_4_vector_load = (count[0] == e->producer->dimensions - 1 && count[4] == 1);
                            } else {
                                for (int i = 0; i < e->producer->dimensions; i++) {
                                    auto stride = jac.first(i, vectorized_loop_index);
                                    vector_broadcast &= stride == 0;
                                    if (i == producer_innermost_dim) {
                                        dense_vector_load &= stride == 1;
                                        stride_2_vector_load &= stride == 2;
                                        stride_3_vector_load &= stride == 3;
                                        stride_4_vector_load &= stride == 4;
                                    } else {
                                        dense_vector_load &= stride == 0;
                                        stride_2_vector_load &= stride == 0;
                                        stride_3_vector_load &= stride == 0;
                                        stride_4_vector_load &= stride == 0;
                                        // TODO: Check for strided
                                        // loads across non-innermost
                                        // dims, and use to count the
                                        // number of pages, cache
                                        // lines, cache conflict misses, etc.
                                    }
                                }
                            }
                        }

                        // Is this load loop-invariant over an
                        // unrolled block? If so, we amortize the
                        // number of loads to account for
                        // LICM. This is the key performance
                        // optimization you get from unrolling the
                        // inner loop of a gemm or conv, so it's
                        // important to capture it in the
                        // featurization.
                        int64_t amortization = 1;
                        if (feat.unrolled_loop_extent > 1) {
                            for (size_t idx = 0; idx < stage->loop.size(); idx++) {
                                if (!stage->loop[idx].rvar) {
                                    bool loop_invariant = true;
                                    for (int i = 0; i < e->producer->dimensions; i++) {
                                        if (!(jac.first(i, idx) == 0)) {
                                            loop_invariant = false;
                                            break;
                                        }
                                    }
                                    if (loop_invariant) {
                                        amortization *= parent->size[idx];
                                    }
                                }
                            }
                        }
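                        // (e.g. in a gemm inner loop unrolled over a
                        // 3-iteration pure axis, a load invariant to
                        // that axis is hoisted and issued once rather
                        // than three times, so n is divided by 3.)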
                        n /= amortization;

                        num_loads += n;
                        if (vector_broadcast) {
                            num_broadcasts += n;
                        } else if (dense_vector_load) {
                            num_dense_loads += n;
                        } else if (stride_2_vector_load) {
                            num_stride_2_loads += n;
                        } else if (stride_3_vector_load) {
                            num_stride_3_loads += n;
                        } else if (stride_4_vector_load) {
                            num_stride_4_loads += n;
                        } else {
                            num_gathers += n;
                        }
                    }
                }

                // Already dealt with the footprints for this producer via some other path
                if (done.find(e->producer) != done.end()) {
                    continue;
                }

                done.insert(e->producer);

                // Now look at the shapes of the regions read from
                // the producer at various sites.
                int64_t max_extent = 1, max_compute_extent = 1, max_store_extent = 1, max_task_extent = 1;
                for (int i = 0; i < e->producer->dimensions; i++) {
                    auto p = bounds->region_required(i);
                    auto compute_p = producer_compute_bounds->region_computed(i);
                    auto store_p = producer_store_bounds->region_required(i);
                    auto task_p = task_bounds->region_required(i);

                    // Check some invariants
                    internal_assert(store_p.min() <= store_p.max()) << store_p.min() << " " << store_p.max() << "\n";
                    internal_assert(compute_p.min() <= compute_p.max()) << compute_p.min() << " " << compute_p.max() << "\n";
                    internal_assert(task_p.min() <= task_p.max()) << task_p.min() << " " << task_p.max() << "\n";

                    int64_t extent = p.extent();
                    int64_t compute_extent = compute_p.extent();
                    int64_t store_extent = store_p.extent();
                    int64_t task_extent = task_p.extent();

                    max_extent = std::max(extent, max_extent);
                    max_compute_extent = std::max(compute_extent, max_compute_extent);
                    max_store_extent = std::max(store_extent, max_store_extent);
                    max_task_extent = std::max(task_extent, max_task_extent);

                    footprint *= extent;
                    compute_footprint *= compute_extent;
                    store_footprint *= store_extent;
                    task_footprint *= task_extent;

                    bool dense = ((e->producer->is_input && i == 0) ||
                                  (site.produce != nullptr && i == site.produce->vector_dim));
                    if (!dense) {
                        line_footprint *= extent;
                        compute_line_footprint *= compute_extent;
                        store_line_footprint *= store_extent;
                        task_line_footprint *= task_extent;
                    }
                }

                if (!producer_has_been_scheduled) {
                    // Optimistically assume it gets vectorized
                    // along whatever dimension makes these
                    // numbers the smallest.
                    line_footprint /= max_extent;
                    compute_line_footprint /= max_compute_extent;
                    store_line_footprint /= max_store_extent;
                    task_line_footprint /= max_task_extent;
                }

                int64_t store_instances_per_consumption = 1;

                if (producer_has_been_scheduled && !e->producer->is_input) {
                    const auto &producer_feat = features->get_or_create(&(e->producer->stages[0]));

                    if (producer_feat.num_realizations) {
                        // The producer's realization is nested inside this Func's realization
                        const int64_t producer_store_instances = producer_feat.num_realizations;
                        if (producer_store_instances > consumer_instances) {
                            store_instances_per_consumption = producer_store_instances / consumer_instances;
                        }
                    }
                }

                allocation_bytes_loaded += compute_footprint;

                if (store_instances_per_consumption > 1) {
                    // The producer is nested inside the consumer
                    bytes_loaded += store_footprint;
                    // Due to folding, the actual buffer size is smaller than the bounds at the store level
                    lines_loaded += store_line_footprint;
                } else {
                    // The consumer is consuming some portion of a larger producer computed earlier
                    bytes_loaded += footprint;
                    lines_loaded += line_footprint;
                }

                // We compute (but never use) these; computing them is cheap,
                // so let's leave them in for future reference, but mark as 'ignore me'
                // to avoid clang-tidy warnings.
                (void)compute_line_footprint;
                (void)task_line_footprint;
            }
        }
    }

    if (at_production) {
        // Properties of the realization, but the values are
        // computable at the production site because that's where
        // the consumers are.
        internal_assert(bytes_loaded >= 0) << "Negative bytes loaded: " << bytes_loaded << "\n";
        feat.allocation_bytes_read_per_realization = allocation_bytes_loaded;
        feat.unique_bytes_read_per_realization = bytes_loaded;
        feat.unique_lines_read_per_realization = lines_loaded;

        if (!at_pure_production) {
            // Also pessimistically assume this update definition relies on the entirety of the produced region so far.
            // TODO: This overbills scatters, or writes to a sub-window.
            internal_assert(bytes_loaded >= 0) << "Negative bytes at production: " << feat.bytes_at_production << "\n";
            feat.unique_bytes_read_per_realization += feat.bytes_at_production;
            feat.unique_lines_read_per_realization += feat.bytes_at_production / feat.innermost_bytes_at_production;
            feat.allocation_bytes_read_per_realization += feat.bytes_at_production;
        }
    }

    if (innermost) {
        feat.points_computed_per_production = subinstances / feat.num_productions;
        // Halide codegens strided loads for small strides as a
        // large dense vector load and a cheap swizzle. ARM even
        // has instructions that do this for free on load
        // (e.g. vld4).
        feat.vector_loads_per_vector = (num_dense_loads +
                                        2 * num_stride_2_loads +
                                        3 * num_stride_3_loads +
                                        4 * num_stride_4_loads);
        feat.scalar_loads_per_vector = num_broadcasts + feat.vector_size * num_gathers;
        feat.scalar_loads_per_scalar = num_loads;
        if (stage->index > 0) {
            // Assume at update definitions we do a self-load
            feat.vector_loads_per_vector++;
            feat.scalar_loads_per_scalar++;
        }
        feat.unique_bytes_read_per_vector = bytes_loaded;
        feat.unique_lines_read_per_vector = lines_loaded;
    }

    // Track features for inlined Funcs
    for (auto it = inlined.begin(); it != inlined.end(); it++) {
        const auto *f = it.key();
        internal_assert(f);
        auto &inlined_feat = features->get_or_create(&(f->stages[0]));
        inlined_feat.inlined_calls += it.value() * subinstances;
        inlined_feat.num_vectors += it.value() * feat.num_vectors;
        inlined_feat.num_scalars += it.value() * feat.num_scalars;
        inlined_feat.native_vector_size = stage->vector_size;
        if (inlined_feat.vector_size > 0) {
            inlined_feat.vector_size = std::min(inlined_feat.vector_size, (double)stage->vector_size);
        } else {
            inlined_feat.vector_size = feat.vector_size;
        }
        if (inlined_feat.innermost_pure_loop_extent > 0) {
            inlined_feat.innermost_pure_loop_extent =
                std::min(inlined_feat.innermost_pure_loop_extent,
                         feat.innermost_pure_loop_extent);
        } else {
            inlined_feat.innermost_pure_loop_extent = feat.innermost_pure_loop_extent;
        }
        inlined_feat.inner_parallelism = 1;
        inlined_feat.outer_parallelism = parallelism;
    }
}

// Get the region required of a Func at this site, from which we
// know what region would be computed if it were scheduled here,
// and what its loop nest would be.
const Bound &LoopNest::get_bounds(const FunctionDAG::Node *f) const {
    if (bounds.contains(f)) {
        const Bound &b = bounds.get(f);
        // Expensive validation for debugging
        // b->validate();
        return b;
    }
    auto bound = f->make_bound();

    // Compute the region required
    if (f->is_output && is_root()) {
        internal_assert(f->outgoing_edges.empty()) << "Outputs that access other outputs not yet supported\n";
        // It's an output. Use the bounds estimate.
        for (int i = 0; i < f->dimensions; i++) {
            bound->region_required(i) = f->estimated_region_required[i];
        }
    } else {
        internal_assert(!f->outgoing_edges.empty())
            << "No consumers of " << f->func.name()
            << " at loop over " << (is_root() ? "root" : node->func.name()) << "\n";
        auto init = Span::empty_span();
        for (int i = 0; i < f->dimensions; i++) {
            bound->region_required(i) = init;
        }

        for (const auto *e : f->outgoing_edges) {
            // Ignore consumers outside of this loop nest
            if (!is_root() &&
                (stage != e->consumer) &&
                !stage->downstream_of(*(e->consumer->node))) {
                continue;
            }
            const auto &c_bounds = get_bounds(e->consumer->node);

            // Get the concrete sizes of the consuming loop
            const auto *consumer_loop = &(c_bounds->loops(e->consumer->index, 0));

            // Use the bounds relationship between the nodes to
            // map from the consumer's loop to the required region
            // of the producer.
            e->expand_footprint(consumer_loop, &(bound->region_required(0)));
        }
    }

    // Given a required region of this producer, use the bounds
    // analysis to figure out what region actually gets
    // computed. For most funcs, these are the same. Some things,
    // like histograms or scans, you can only really compute all
    // of at once.
    f->required_to_computed(&(bound->region_required(0)), &(bound->region_computed(0)));

    // Finally, figure out what loop nests will be used to compute
    // this region.
    for (int i = 0; i < (int)f->stages.size(); i++) {
        f->loop_nest_for_region(i, &(bound->region_computed(0)), &(bound->loops(i, 0)));
    }

    const Bound &b = set_bounds(f, bound);
    // Validation is expensive, turn it off by default.
    // b->validate();
    return b;
}

// Recursively print a loop nest representation to stderr
void LoopNest::dump(string prefix, const LoopNest *parent) const {
    if (!is_root()) {
        // Non-root nodes always have parents.
        internal_assert(parent != nullptr);

        aslog(0) << prefix << node->func.name();
        prefix += " ";

        for (size_t i = 0; i < size.size(); i++) {
            aslog(0) << " " << size[i];
            // The vectorized loop gets a 'v' suffix
            if (innermost && i == (size_t)vectorized_loop_index) {
                aslog(0) << "v";
            }
            // Loops that have a known constant size get a
            // 'c'. Useful for knowing what we can unroll.
            if (parent->get_bounds(node)->loops(stage->index, i).constant_extent()) {
                aslog(0) << "c";
            }
        }

        // Uncomment when debugging the representative loop bounds selected.
        /*
        const auto &bounds = get_bounds(node);
        for (size_t i = 0; i < size.size(); i++) {
            const auto &p = bounds->loops(stage->index, i);
            aslog(0) << " [" << p.first << ", " << p.second << "]";
        }
        */

        aslog(0) << " (" << vectorized_loop_index << ", " << vector_dim << ")";
    }

    if (tileable) {
        aslog(0) << " t";
    }
    if (innermost) {
        aslog(0) << " *\n";
    } else if (parallel) {
        aslog(0) << " p\n";
    } else {
        aslog(0) << "\n";
    }
    for (auto p : store_at) {
        aslog(0) << prefix << "realize: " << p->func.name() << "\n";
    }
    for (size_t i = children.size(); i > 0; i--) {
        children[i - 1]->dump(prefix, this);
    }
    for (auto it = inlined.begin(); it != inlined.end(); it++) {
        aslog(0) << prefix << "inlined: " << it.key()->func.name() << " " << it.value() << "\n";
    }
}

// Does this loop nest access the given Func
bool LoopNest::calls(const FunctionDAG::Node *f) const {
    for (const auto &c : children) {
        if (c->calls(f)) return true;
    }
    for (const auto *e : f->outgoing_edges) {
        if (e->consumer == stage) {
            return true;
        }
        if (inlined.contains(e->consumer->node)) {
            return true;
        }
    }
    return false;
}

// What is the maximum number of inlined calls to a Func that
// occur within this loop. Used to prune states that would
// generate too much code.
int64_t LoopNest::max_inlined_calls() const {
    int64_t result = 0;
    for (auto it = inlined.begin(); it != inlined.end(); it++) {
        result = std::max(result, it.value());
    }
    for (const auto &c : children) {
        result = std::max(result, c->max_inlined_calls());
    }
    return result;
}

// Does this loop nest access an input buffer? Used to select
// tail strategies when splitting loops. We don't want to read
// out of bounds on inputs, even if we don't intend to use the
// values read. It could create annoying assertion failures for
// the user. It's OK to read out of range of the values computed
// on internal Funcs though. Allocation bounds inference just pads
// out the bounds so that it won't fault.
bool LoopNest::accesses_input_buffer() const {
    for (const auto &c : children) {
        if (c->accesses_input_buffer()) return true;
    }
    if (is_root()) return false;

    auto check = [&](const FunctionDAG::Node::Stage *s) {
        for (const auto *e : s->incoming_edges) {
            if (e->producer->is_input) return true;
        }

        for (int t = 0; t < (int)PipelineFeatures::ScalarType::NumScalarTypes; t++) {
            if (s->features.op_histogram[(int)PipelineFeatures::OpType::ImageCall][t] > 0) return true;
        }
        return false;
    };

    if (check(stage)) return true;
    for (auto it = inlined.begin(); it != inlined.end(); it++) {
        if (check(&(it.key()->stages[0]))) return true;
    }
    return false;
}

// Does this loop nest contain a computation of the given Func.
bool LoopNest::computes(const FunctionDAG::Node *f) const {
    if (f == node) {
        return true;
    }
    if (inlined.contains(f)) {
        return true;
    }
    for (const auto &c : children) {
        if (c->computes(f)) return true;
    }
    return false;
}

// Above here most methods query the loop nest. Below we have
// methods that mutate the loop nest.

// Inline a Func into all consumers within this loop.
void LoopNest::inline_func(const FunctionDAG::Node *f) {
    // Inline it into the children
    for (size_t i = 0; i < children.size(); i++) {
        if (children[i]->calls(f)) {
            std::unique_ptr<LoopNest> new_child{new LoopNest};
            new_child->copy_from(*children[i]);
            new_child->inline_func(f);
            children[i] = new_child.release();
        }
    }

    // Inline it here if there are any direct calls
    if (innermost) {
        int64_t calls = 0;
        for (const auto *e : f->outgoing_edges) {
            if (inlined.contains(e->consumer->node)) {
                calls += inlined.get(e->consumer->node) * e->calls;
            }
            if (e->consumer == stage) {
                calls += e->calls;
            }
        }
        if (calls) {
            inlined.insert(f, calls);
        }
    }
}

// Compute a Func at this site.
void LoopNest::compute_here(const FunctionDAG::Node *f, bool tileable, int v) {
    const auto &bounds = get_bounds(f);

    if (!may_subtile()) {
        // If we are restricting ourselves to the Mullapudi et al
        // scheduling space, then once something is computed here
        // we may not subtile this loop.
        this->tileable = false;
    }

    for (int s = (int)f->stages.size() - 1; s >= 0; s--) {
        LoopNest *node = new LoopNest;
        node->node = f;
        node->stage = &f->stages[s];
        node->innermost = true;
        node->vectorized_loop_index = -1;
        node->tileable = tileable && (is_root() || may_subtile());
        // Set up a bound for the inside of the
        // loop. computed/required is still the full region, but
        // the loop nest will be a single representative point.
        auto single_point = bounds->make_copy();
        size_t loop_dim = f->stages[s].loop.size();
        node->size.resize(loop_dim);

        int64_t total_extent = 1;
        int64_t vector_size = 1;
        for (size_t i = 0; i < loop_dim; i++) {
            const auto &l = bounds->loops(s, i);
            // Initialize the loop nest
            node->size[i] = l.extent();
            total_extent *= node->size[i];

            // Use the first loop iteration to represent the inner
            // loop. We'll shift it to a later one once we decide
            // on vectorization.
            single_point->loops(s, i) = Span(l.min(), l.min(), true);

            internal_assert(l.max() >= l.min()) << i << " " << l.max() << " " << l.min() << "\n";

            if (f->dimensions &&
                node->size[i] >= 1 &&
                f->stages[s].loop[i].var == f->func.args()[v]) {
                node->vectorized_loop_index = (int)i;
                vector_size = (int64_t)(node->stage->vector_size);
                single_point->loops(s, i).set_extent(vector_size);
                node->size[i] += vector_size - 1;
                node->size[i] /= vector_size;

                // Shift the loops along by some multiple of the
                // vector size, to pick a more representative vector
                // than the first. We use the middle-most.
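                // (e.g. with vector_size == 8 and an extent of 100
                // there are 13 vector iterations, so the shift is
                // 8 * 6 == 48 and the representative vector covers
                // [min + 48, min + 55].)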
                int64_t shift = vector_size * (node->size[i] / 2);
                single_point->loops(s, i).translate(shift);
            } else {
                int64_t shift = node->size[i] / 2;
                single_point->loops(s, i).translate(shift);
            }
        }

        // Leave region required blank inside the computation of a Func
        node->set_bounds(f, std::move(single_point));
        node->vector_dim = v;

        if (node->vectorized_loop_index >= 0) {
            // Split off the single vector as an inner loop nest.
            node->innermost = false;

            LoopNest *one_vector = new LoopNest;
            one_vector->node = node->node;
            one_vector->stage = node->stage;
            one_vector->tileable = false;
            one_vector->vectorized_loop_index = node->vectorized_loop_index;
            one_vector->vector_dim = v;
            one_vector->size.resize(loop_dim, 1);
            one_vector->innermost = true;
            auto b = node->get_bounds(f)->make_copy();
            // Set the region computed inside this node to be the first vector lane
            b->loops(s, node->vectorized_loop_index).set_extent(1);
            one_vector->set_bounds(f, b);
            one_vector->size[node->vectorized_loop_index] = vector_size;

            node->children.emplace_back(one_vector);
        }
        children.emplace_back(node);
    }
}

// Parallelize this loop according to the given tiling.
IntrusivePtr<const LoopNest> LoopNest::parallelize_in_tiles(const MachineParams &params,
                                                            const vector<int64_t> &tiling,
                                                            const LoopNest *parent) const {

    // Split this loop and move factors to the inner loop
    LoopNest *inner = new LoopNest, *outer = new LoopNest;
    inner->node = outer->node = node;
    inner->stage = outer->stage = stage;
    inner->tileable = outer->tileable = tileable && may_subtile();
    inner->vector_dim = outer->vector_dim = vector_dim;
    inner->vectorized_loop_index = outer->vectorized_loop_index = vectorized_loop_index;
    outer->size = size;
    outer->innermost = false;
    outer->parallel = true;
    outer->tileable = may_subtile();

    // First make an inner loop representing a 1x1x1... tile
    inner->size.resize(size.size(), 1);
    inner->innermost = innermost;
    inner->children = children;
    inner->inlined = inlined;
    inner->bounds = bounds;
    inner->store_at = store_at;

    auto b = inner->get_bounds(node)->make_copy();

    // Then move factors from the outer loop to the inner loop
    auto parent_bounds = parent->get_bounds(node);

    for (size_t i = 0; i < stage->loop.size(); i++) {
        int l = stage->loop[i].pure_dim;

        int64_t outer_extent;
        if (l >= 0) {
            internal_assert(l < (int)tiling.size()) << l << " " << tiling.size() << "\n";
            outer_extent = tiling[l];
        } else {
            // RVars are moved inwards
            outer_extent = 1;
        }

        inner->size[i] = (outer->size[i] + outer_extent - 1) / outer_extent;

        // Recompute the outer size given the selected inner size
        outer_extent = (outer->size[i] + inner->size[i] - 1) / inner->size[i];

        outer->size[i] = outer_extent;
        const auto &p = parent_bounds->loops(stage->index, i);
        int64_t min = p.min();
        int64_t extent = p.extent();
        extent = (extent + outer_extent - 1) / outer_extent;

        // Pick a better representative loop iteration for the
        // inner loops.
        min += (outer_extent / 2) * extent;
        bool compile_time_constant_bounds = p.constant_extent() || ((outer_extent > 1) && stage->loop[i].pure);
        b->loops(stage->index, i) = Span(min, min + extent - 1, compile_time_constant_bounds);
    }
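    // (e.g. a pure loop of extent 100 with tiling[l] == 16 gets an
    // inner extent of ceil(100 / 16) == 7; the outer extent is then
    // recomputed as ceil(100 / 7) == 15, so 15 * 7 == 105 iterations
    // cover the original 100 with a small tail.)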
    outer->set_bounds(node, b);

    outer->children.emplace_back(inner);
    return outer;
}

// Return all possible ways to compute f in tiles somewhere within
// this loop nest.
vector<IntrusivePtr<const LoopNest>> LoopNest::compute_in_tiles(const FunctionDAG::Node *f,
                                                                const LoopNest *parent,
                                                                const MachineParams &params,
                                                                int v,
                                                                bool in_realization) const {
    internal_assert(f);

    vector<IntrusivePtr<const LoopNest>> result;

    // Some pruning to not waste time on terrible states
    if (parent) {
        const auto &bounds_here = get_bounds(f);
        const auto &bounds_at_parent = parent->get_bounds(f);

        // Don't descend into loops that break our ability to
        // vectorize if we could have vectorized one level up.
        const auto &p = bounds_here->region_computed(v);
        const auto &p_parent = bounds_at_parent->region_computed(v);
        int64_t e = p.extent();
        int64_t ep = p_parent.extent();
        if (ep >= f->vector_size && e < f->vector_size) return result;

        // Don't descend into loops if the bounds required don't
        // shrink.
        int64_t total_here = 1, total_at_parent = 1;
        for (int i = 0; i < f->dimensions; i++) {
            const auto &range_here = bounds_here->region_computed(i);
            const auto &range_at_parent = bounds_at_parent->region_computed(i);
            total_here *= range_here.extent();
            total_at_parent *= range_at_parent.extent();
        }
        if (total_here >= total_at_parent) return result;
    }

    // Figure out which child we can fuse this into
    int child = -1;
    bool called_by_multiple_children = false;
    for (int i = 0; i < (int)children.size(); i++) {
        if (children[i]->calls(f)) {
            if (child != -1) {
                called_by_multiple_children = true;
            }
            child = i;
        }
    }

    // Place the computation directly inside this loop (provided it's not a SIMD loop)
    if (!innermost &&
        (!in_realization ||
         size.empty() ||
         vector_dim == -1 ||
         size[vector_dim] == 1)) {

        std::unique_ptr<LoopNest> r{new LoopNest};
        r->copy_from(*this);
        r->compute_here(f, true, v);
        if (!in_realization) {
            r->store_at.insert(f);
        } else {
            r->tileable = false;
        }
        result.emplace_back(r.release());
    }

    if (f->is_output) {
        // Outputs must be compute_root, so we're done.
        return result;
    }

    if (tileable) {
        // The root node is not tileable, so all tileable nodes have parents.
        internal_assert(parent != nullptr);

        // Generate a list of tile sizes to try
        auto tilings = generate_tilings(size, (int)(size.size() - 1), 2, !in_realization);

        if (tilings.size() > 10000) {
            aslog(0) << "Warning: lots of tilings: " << tilings.size() << "\n";
        }

        for (auto t : tilings) {
            if (parallel) {
                const auto &l = stage->loop;
                // More pruning. Skip root-level tilings that
                // would leave too many cores idle, and root-level
                // tilings that would force serialization of
                // dimensions we have decided to parallelize over
                // in an earlier pass.
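                // (e.g. with 8 cores, a tiling yielding 10 pure
                // tasks rounds up to 2 tasks on some cores and 1 on
                // others, giving idle_cores == 1.6 below, so it is
                // pruned; 16 tasks divide evenly and survive.)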
1408 int total = 1;
1409 size_t idx = 0;
1410 for (auto s : t) {
1411 if (l[idx].pure) {
1412 total *= s;
1413 }
1414 idx++;
1415 }
1416
1417 const double tasks_per_core = (double)total / params.parallelism;
1418 const double idle_cores = std::ceil(tasks_per_core) / tasks_per_core;
1419 if (idle_cores > 1.1) continue;
1420 }
1421
1422 // Tile this loop and place the computation at some coarser granularity
1423 LoopNest *inner = new LoopNest, *outer = new LoopNest;
1424 inner->node = outer->node = node;
1425 inner->stage = outer->stage = stage;
1426 inner->tileable = outer->tileable = tileable && may_subtile();
1427 inner->vector_dim = outer->vector_dim = vector_dim;
1428 inner->vectorized_loop_index = outer->vectorized_loop_index = vectorized_loop_index;
1429 outer->size = size;
1430 outer->innermost = false;
1431 outer->parallel = parallel;
1432 inner->parallel = false;
1433
1434 // First make an inner loop representing a 1x1x1... tile
1435 inner->size.resize(size.size(), 1);
1436 inner->innermost = innermost;
1437 inner->children = children;
1438 inner->inlined = inlined;
1439 inner->bounds = bounds;
1440 inner->store_at = store_at;
1441
1442 {
1443 auto b = inner->get_bounds(node)->make_copy();
1444
1445 // Then move factors from the outer loop to the inner loop
1446 auto parent_bounds = parent->get_bounds(node);
1447
1448 for (size_t i = 0; i < t.size(); i++) {
1449 int64_t outer_extent = t[i];
1450 inner->size[i] = (outer->size[i] + outer_extent - 1) / outer_extent;
1451 outer->size[i] = outer_extent;
1452 const auto &p = parent_bounds->loops(stage->index, i);
1453 int64_t min = p.min();
1454 int64_t original_extent = p.extent();
1455 int64_t inner_extent = (original_extent + outer_extent - 1) / outer_extent;
1456 // Pick a more representative loop iteration
1457 min += (outer_extent / 2) * inner_extent;
1458 bool compile_time_constant_extent =
1459 (p.constant_extent() || outer_extent > 1) &&
1460 (inner_extent == 1 || outer_extent == 1 || stage->index == 0);
1461 b->loops(stage->index, i) = Span(min, min + inner_extent - 1, compile_time_constant_extent);
1462 }
1463
1464 // Region_{computed/required} on outer is now
1465 // wrong, but it doesn't matter because consumers
1466 // only look at the loops in get_bounds. Still,
1467 // this is weird.
1468 outer->set_bounds(node, b);
1469 }
1470
1471 if (!in_realization) {
1472 outer->store_at.insert(f);
1473 }
1474 outer->children.emplace_back(inner);
1475
1476 // HACK
1477 // bool may_slide = false;
1478 bool may_slide = (!in_realization &&
1479 f->stages.size() == 1);
1480 if (may_slide) {
1481 // Store here, but compute further in. Currently
1482 // don't have to worry about the constraints this
1483 // places on parallelism, as we forced all the
1484 // parallelism to the outer loop.
1485 auto opts = inner->compute_in_tiles(f, outer, params, v, true);
1486 for (IntrusivePtr<const LoopNest> &n : opts) {
1487 LoopNest *store_at_outer_compute_further_in = new LoopNest;
1488 store_at_outer_compute_further_in->copy_from(*outer);
1489 store_at_outer_compute_further_in->children.pop_back();
1490 store_at_outer_compute_further_in->children.emplace_back(std::move(n));
1491 result.emplace_back(store_at_outer_compute_further_in);
1492 }
1493 }
1494
1495 // Site the computation inside the outer loop
1496 outer->compute_here(f, true, v);
1497 outer->tileable &= !in_realization;
1498 result.emplace_back(outer);
1499 }
1500 }
1501
1502 if (child >= 0 && !called_by_multiple_children && !in_realization &&
1503 (may_subtile() || is_root())) {
1504 // Push the Func further inwards in the loop nest
1505
1506 // See if it's appropriate to slide over this loop Can't
1507 // slide at the root level if we intend to parallelize it.
1508 bool may_slide = (params.parallelism == 1) || !is_root();
1509
1510 const auto &c = children[child];
1511 int num_ones = 0;
1512 for (size_t i = 0; i < c->size.size(); i++) {
1513 int64_t s = c->size[i];
1514 num_ones += (s == 1) ? 1 : 0;
1515 }
1516
1517 // Some pruning:
1518
1519 // Only slide over single-dimensional loops
1520 may_slide &= num_ones == ((int)c->size.size() - 1);
1521
1522 // Don't slide funcs with update stages
1523 may_slide &= f->stages.size() == 1;
1524
1525 // Don't slide over the vector dimension
1526 may_slide &= (c->vectorized_loop_index == -1 ||
1527 c->size[c->vectorized_loop_index] == 1);
1528
1529 for (int store_here = 0; store_here < 2; store_here++) {
1530 if (store_here && !may_slide) {
1531 // We place all our parallel loops at the root
1532 // level, so this would constrain parallelism.
1533 continue;
1534 }
1535 if (is_root() && num_ones == (int)c->size.size() && params.parallelism > 1) {
1536 // Don't fuse into serial loops, or we could never parallelize this Func.
1537 continue;
1538 }
1539 auto opts = children[child]->compute_in_tiles(f, this, params, v, store_here);
1540 for (IntrusivePtr<const LoopNest> &n : opts) {
1541 // (Only valid if one child calls f) Push the
1542 // computation into the child. Possibly leaving
1543 // the storage out here.
1544 LoopNest *r = new LoopNest;
1545 r->copy_from(*this);
1546 if (store_here) {
1547 r->store_at.insert(f);
1548 }
1549 r->children[child] = n;
1550 result.emplace_back(r);
1551 }
1552 }
1553 }
1554
1555 return result;
1556 }
1557
1558 // Apply the schedule represented by this loop nest to a Halide pipeline.
apply(LoopLevel here,StageMap<std::unique_ptr<StageScheduleState>> & state_map,double num_cores,int depth,const LoopNest * parent,const LoopNest * compute_site) const1559 void LoopNest::apply(LoopLevel here,
1560 StageMap<std::unique_ptr<StageScheduleState>> &state_map,
1561 double num_cores,
1562 int depth,
1563 const LoopNest *parent,
1564 const LoopNest *compute_site) const {
1565 if (is_root()) {
1566 for (auto &c : children) {
1567 Func(c->node->func).compute_root();
1568 c->apply(LoopLevel::root(), state_map, num_cores, 1, this, c.get());
1569 if (c->stage->index == 0) {
1570 auto &state = state_map.get(c->stage);
1571 state->schedule_source << "\n .compute_root()";
1572 // TODO: Omitting logic for printing store_root() assumes everything store_root is also compute root
1573 }
1574 }
1575 } else {
1576 // Non-root nodes always have parents.
1577 internal_assert(parent != nullptr);
1578
1579 if (parent->node != node) {
1580 compute_site = this;
1581 }
1582
1583 const auto &symbolic_loop = stage->loop;
1584 const auto &parent_bounds = parent->get_bounds(node);
1585 if (!state_map.contains(stage)) {
1586 StageScheduleState *state = new StageScheduleState;
1587 state->num_cores = num_cores;
1588 state->vector_dim = vector_dim;
1589 state->vectorized_loop_index = vectorized_loop_index;
            for (size_t i = 0; i < symbolic_loop.size(); i++) {
                StageScheduleState::FuncVar fv;
                const auto &l = symbolic_loop[i];
                fv.var = VarOrRVar(l.var, !l.pure);
                fv.orig = fv.var;
                fv.accessor = l.accessor;
                const auto &p = parent_bounds->loops(stage->index, i);
                fv.extent = p.extent();
                fv.constant_extent = p.constant_extent();
                fv.outermost = true;
                fv.parallel = l.pure && parallel;
                fv.exists = true;
                fv.pure = l.pure;
                fv.index = i;
                fv.innermost_pure_dim = (i == (size_t)vectorized_loop_index);
                state->vars.push_back(fv);
            }
            // Bubble the innermost pure dimension to the front of the pure dimensions
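            // (e.g. pure loop vars [x, y, c] with vectorized_loop_index == 2
            // become [c, x, y], so the vectorized dimension comes first).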
            for (int i = vectorized_loop_index - 1;
                 i >= 0 && state->vars[i].pure; i--) {
                std::swap(state->vars[i], state->vars[i + 1]);
            }
            state_map.emplace(stage, std::unique_ptr<StageScheduleState>(state));
        }
        auto &state = *(state_map.get(stage));

        // Grab a Stage handle for this stage; update stages are
        // accessed through Func::update().
        Stage s = Func(node->func);
        if (stage->index > 0) {
            s = Func(node->func).update(stage->index - 1);
        }

        if (stage->index == 0 && parent->node != node) {
            // Pick a memory type
            double bytes = node->bytes_per_point;
            for (int i = 0; i < node->dimensions; i++) {
                const auto &p = parent_bounds->region_computed(i);
                bytes *= p.extent();
            }
            if (bytes < 64000 && depth > 2) {
                // If it's probably a small allocation, and it's
                // made more than once, use stack-scoped
                // storage. Otherwise let the compiler pick heap
                // or stack as it likes.
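                // (e.g. a 100x100 float intermediate is 40000 bytes and,
                // at depth > 2, typically sits inside another loop and is
                // realized many times, so stack scoping pays off.)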
                Func(node->func).store_in(MemoryType::Stack);
                state.schedule_source << "\n    .store_in(MemoryType::Stack)";
            }
        }

        // Pick a tail strategy for any splits of pure vars. RVars always use GuardWithIf.
        auto pure_var_tail_strategy = TailStrategy::Auto;
        if (!compute_site->accesses_input_buffer() && !node->is_output) {
            // RoundUp is the lowest-overhead strategy, provided it
            // doesn't expand the bounds read on the input or written
            // on the output. However, it can only really be used on
            // pure stages that don't access the input anywhere in
            // their loop nest.
            pure_var_tail_strategy = TailStrategy::RoundUp;
        } else if (stage->index == 0) {
            // Pure stages that access the input use ShiftInwards.
            pure_var_tail_strategy = TailStrategy::ShiftInwards;
        } else {
            // For pure vars in update stages that access the
            // input, it's not safe to round up or redundantly
            // recompute.
            pure_var_tail_strategy = TailStrategy::GuardWithIf;
        }
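        // For example, splitting a loop of extent 10 by a factor of 4:
        // RoundUp evaluates 12 points (two wasted), ShiftInwards shifts
        // the last tile to cover [6, 10) (points 6 and 7 are evaluated
        // twice), and GuardWithIf evaluates exactly 10 points at the
        // cost of a branch in the inner loop.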

        if (!size.empty()) {
            if (innermost) {
                if (vectorized_loop_index >= 0) {
                    size_t i = 0;
                    while (!state.vars[i].innermost_pure_dim) {
                        i++;
                    }
                    auto &v = state.vars[i];
                    internal_assert(v.innermost_pure_dim && v.exists) << v.var.name() << "\n";
                    // This var is the result of a split.
                    state.schedule_source
                        << "\n    .vectorize(" << v.var.name() << ")";
                    s.vectorize(v.var);
                }
            } else {
                // Grab the innermost loop for this node
                const LoopNest *innermost_loop = this, *child = nullptr;
                while (!innermost_loop->innermost) {
                    for (const auto &c : innermost_loop->children) {
                        if (c->node == node) {
                            if (!child) {
                                child = c.get();
                            }
                            innermost_loop = c.get();
                            break;
                        }
                    }
                }

                // Do the implied splits
                vector<StageScheduleState::FuncVar> new_inner;
                for (size_t i = 0; i < symbolic_loop.size(); i++) {
                    StageScheduleState::FuncVar v;
                    StageScheduleState::FuncVar &parent = state.vars[i];

                    int64_t factor = (parent.extent + size[parent.index] - 1) / size[parent.index];
                    int64_t innermost_size = innermost_loop->size[parent.index];

                    if (child && parent.innermost_pure_dim) {
                        // Ensure the split is a multiple of the
                        // vector size. With all these rounded
                        // divs going on it can drift.
                        factor = ((factor + innermost_size - 1) / innermost_size) * innermost_size;
                    }
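                    // e.g. a parent extent of 100 tiled into 8 outer
                    // iterations gives factor = ceil(100 / 8) = 13, which
                    // the rounding above bumps to 16 when the innermost
                    // vector extent is 4.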

                    if (child && innermost_size > factor) {
                        factor = innermost_size;
                    }

                    if (!parent.exists || factor == 1) {
                        v.exists = false;
                        v.extent = 1;
                    } else if (size[parent.index] == 1 && !(child &&
                                                            child->innermost &&
                                                            parent.innermost_pure_dim &&
                                                            parent.var.name() == parent.orig.name())) {
                        // Not split in this dimension
                        v = parent;
                        v.parallel = false;
                        parent.exists = false;
                        parent.extent = 1;
                    } else {
                        VarOrRVar inner(Var(parent.var.name() + "i"));
                        if (parent.var.is_rvar) {
                            inner = RVar(parent.var.name() + "i");
                        }

                        auto tail_strategy = pure_var_tail_strategy;
                        // If it's an RVar, or a non-outermost split in an
                        // update stage, we need GuardWithIf instead.
                        if (parent.var.is_rvar || (stage->index != 0 && !parent.outermost)) {
                            tail_strategy = TailStrategy::GuardWithIf;
                        }

                        if (factor > parent.extent && tail_strategy == TailStrategy::ShiftInwards) {
                            // Don't shift all the way off the image.
                            tail_strategy = TailStrategy::GuardWithIf;
                        }

                        s.split(parent.var, parent.var, inner, (int)factor, tail_strategy);
                        state.schedule_source
                            << "\n    .split("
                            << parent.var.name() << ", "
                            << parent.var.name() << ", "
                            << inner.name() << ", "
                            << factor << ", "
                            << "TailStrategy::" << tail_strategy << ")";
                        v = parent;
                        parent.extent = size[parent.index];
                        v.constant_extent = (tail_strategy != TailStrategy::GuardWithIf);
                        v.var = inner;
                        v.accessor.clear();
                        v.extent = factor;
                        v.parallel = false;
                        v.outermost = false;
                    }
                    new_inner.push_back(v);
                }

                if (child->innermost) {
                    // Maybe do some unrolling

                    int64_t product_of_pure_loops = 1;
                    bool all_pure_loops_constant_size = true;
                    for (size_t i = 0; i < symbolic_loop.size(); i++) {
                        if (state.vars[i].pure) {
                            product_of_pure_loops *= state.vars[i].extent;
                            all_pure_loops_constant_size &= state.vars[i].constant_extent;
                        }
                    }
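                    // e.g. a 3x4 tile of pure loops gives a product of 12,
                    // which is within kUnrollLimit, so it is fully unrolled
                    // below provided the extents are compile-time constants.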

                    if (product_of_pure_loops <= kUnrollLimit && all_pure_loops_constant_size) {
                        // With luck, everything computed at this level will
                        // fit into registers if we fully unroll.
                        // TODO: 16 should be the number of vector registers in the architecture
                        std::stable_sort(state.vars.begin(), state.vars.begin() + symbolic_loop.size(),
                                         [](const StageScheduleState::FuncVar &a, const StageScheduleState::FuncVar &b) {
                                             return a.pure && !b.pure;
                                         });
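                        // stable_sort keeps relative order within each
                        // group, so pure vars move ahead of any RVars
                        // without disturbing the vectorized-dim-first
                        // ordering established earlier.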

                        for (size_t i = 0; i < symbolic_loop.size(); i++) {
                            if (state.vars[i].pure && state.vars[i].exists && state.vars[i].extent > 1) {
                                s.unroll(state.vars[i].var);
                                state.schedule_source << "\n    .unroll(" << state.vars[i].var.name() << ")";
                            }
                        }
                    }
                }

                bool found = false;
                for (const auto &v : state.vars) {
                    if (!v.exists) continue;
                    here = LoopLevel(node->func, v.var);
                    found = true;
                    break;
                }
                if (!found) {
                    here = LoopLevel(node->func, Var::outermost());
                }
                // internal_assert(found) << "Could not find appropriate compute_at location for children of " << node->func.name() << "\n";
                state.vars.insert(state.vars.begin(), new_inner.begin(), new_inner.end());
            }
        }
        if (innermost) {
            internal_assert(store_at.empty());
            internal_assert(children.empty());
            return;
        }

        for (auto f : store_at) {
            Func(f->func).store_at(here);
        }
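        // Divide the parallelism budget by the total extent of this
        // loop nest: e.g. 16 cores over a total extent of 4 leaves a
        // budget of 4 cores for each child invocation.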
        for (auto s : size) {
            num_cores /= s;
        }
        here.lock();
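        // Build the suffix appended to ".compute" / ".store" when
        // printing the schedule source below: "_root()" at the root,
        // otherwise "_at(f, v)".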
        string loop_level;
        if (here.is_root()) {
            loop_level = "_root()";
        } else {
            loop_level = "_at(" + here.func() + ", " + here.var().name() + ")";
        }
        for (auto &c : children) {
            if (c->node != node) {
                Func(c->node->func).compute_at(here);
            }
            c->apply(here, state_map, num_cores, depth + 1, this, compute_site);
            if (c->node != node && c->stage->index == 0) {
                auto &state = *(state_map.get(c->stage));
                state.schedule_source << "\n    .compute" << loop_level;
            }
        }
        for (auto f : store_at) {
            bool computed_here = false;
            for (auto &c : children) {
                if (c->node == f) {
                    computed_here = true;
                    break;
                }
            }
            if (!computed_here) {
                auto &state = *(state_map.get(&(f->stages[0])));
                state.schedule_source << "\n    .store" << loop_level;
            }
        }
    }
}

} // namespace Autoscheduler
} // namespace Internal
} // namespace Halide