1 /** \file
 * Defines a Buffer type that wraps a halide_buffer_t and adds
 * functionality, along with methods for more conveniently iterating
 * over the samples in a halide_buffer_t outside of Halide code. */
5 
6 #ifndef HALIDE_RUNTIME_BUFFER_H
7 #define HALIDE_RUNTIME_BUFFER_H
8 
9 #include <algorithm>
10 #include <atomic>
11 #include <cassert>
12 #include <limits>
13 #include <memory>
14 #include <stdint.h>
15 #include <string.h>
16 #include <vector>
17 
18 #if defined(__has_feature)
19 #if __has_feature(memory_sanitizer)
20 #include <sanitizer/msan_interface.h>
21 #endif
22 #endif
23 
24 #include "HalideRuntime.h"
25 
26 #ifdef _MSC_VER
27 #include <malloc.h>
28 #define HALIDE_ALLOCA _alloca
29 #else
30 #define HALIDE_ALLOCA __builtin_alloca
31 #endif
32 
33 // gcc 5.1 has a false positive warning on this code
34 #if __GNUC__ == 5 && __GNUC_MINOR__ == 1
35 #pragma GCC diagnostic ignored "-Warray-bounds"
36 #endif
37 
38 namespace Halide {
39 namespace Runtime {
40 
41 // Forward-declare our Buffer class
42 template<typename T, int D>
43 class Buffer;
44 
// A helper to check if a parameter pack is entirely implicitly
// convertible to int, for use with std::enable_if
47 template<typename... Args>
48 struct AllInts : std::false_type {};
49 
50 template<>
51 struct AllInts<> : std::true_type {};
52 
53 template<typename T, typename... Args>
54 struct AllInts<T, Args...> {
55     static const bool value = std::is_convertible<T, int>::value && AllInts<Args...>::value;
56 };
57 
58 // Floats and doubles are technically implicitly int-convertible, but
59 // doing so produces a warning we treat as an error, so just disallow
60 // it here.
61 template<typename... Args>
62 struct AllInts<float, Args...> : std::false_type {};
63 
64 template<typename... Args>
65 struct AllInts<double, Args...> : std::false_type {};
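
// For illustration: the trait accepts packs of integral types and rejects
// packs containing a float or double. For example:
static_assert(AllInts<int, char, unsigned>::value, "integral packs are accepted");
static_assert(!AllInts<int, float>::value, "packs containing float are rejected");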
66 
67 // A helper to detect if there are any zeros in a container
68 namespace Internal {
69 template<typename Container>
70 bool any_zero(const Container &c) {
71     for (int i : c) {
72         if (i == 0) return true;
73     }
74     return false;
75 }
76 }  // namespace Internal
77 
78 /** A struct acting as a header for allocations owned by the Buffer
79  * class itself. */
80 struct AllocationHeader {
81     void (*deallocate_fn)(void *);
82     std::atomic<int> ref_count;
83 
84     // Note that ref_count always starts at 1
85     AllocationHeader(void (*deallocate_fn)(void *))
86         : deallocate_fn(deallocate_fn), ref_count(1) {
87     }
88 };
89 
/** This indicates how to deallocate the device allocation for a Halide::Runtime::Buffer. */
enum struct BufferDeviceOwnership : int {
    Allocated,               ///< halide_device_free will be called when device ref count goes to zero
    WrappedNative,           ///< halide_device_detach_native will be called when device ref count goes to zero
    Unmanaged,               ///< No free routine will be called when device ref count goes to zero
    AllocatedDeviceAndHost,  ///< halide_device_and_host_free will be called when device ref count goes to zero
    Cropped,                 ///< halide_device_release_crop will be called when device ref count goes to zero
97 };
98 
/** A reference-counting struct for managing device allocations,
 * analogous to AllocationHeader for host allocations. */
100 struct DeviceRefCount {
101     // This is only ever constructed when there's something to manage,
102     // so start at one.
103     std::atomic<int> count{1};
104     BufferDeviceOwnership ownership{BufferDeviceOwnership::Allocated};
105 };
106 
107 /** A templated Buffer class that wraps halide_buffer_t and adds
108  * functionality. When using Halide from C++, this is the preferred
109  * way to create input and output buffers. The overhead of using this
110  * class relative to a naked halide_buffer_t is minimal - it uses another
111  * ~16 bytes on the stack, and does no dynamic allocations when using
112  * it to represent existing memory of a known maximum dimensionality.
113  *
114  * The template parameter T is the element type. For buffers where the
115  * element type is unknown, or may vary, use void or const void.
116  *
117  * D is the maximum number of dimensions that can be represented using
118  * space inside the class itself. Set it to the maximum dimensionality
119  * you expect this buffer to be. If the actual dimensionality exceeds
120  * this, heap storage is allocated to track the shape of the buffer. D
121  * defaults to 4, which should cover nearly all usage.
122  *
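 * For illustration, a buffer of float image data could be declared as
 * Buffer<float>, while a buffer whose element type is only known at
 * runtime could be declared as Buffer<void> and given an explicit
 * halide_type_t at construction:
 *
 * \code
 * Halide::Runtime::Buffer<float> typed(640, 480, 3);
 * Halide::Runtime::Buffer<void> untyped(halide_type_of<float>(), 640, 480, 3);
 * \endcode
 *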
 * The class optionally allocates and owns memory for the image using
 * a shared pointer allocated with the provided allocator. If the
 * allocator is null, malloc and free are used. Any device-side
 * allocation is considered owned if and only if the host-side
 * allocation is owned. */
128 template<typename T = void, int D = 4>
129 class Buffer {
130     /** The underlying halide_buffer_t */
131     halide_buffer_t buf = {0};
132 
133     /** Some in-class storage for shape of the dimensions. */
134     halide_dimension_t shape[D];
135 
136     /** The allocation owned by this Buffer. NULL if the Buffer does not
137      * own the memory. */
138     AllocationHeader *alloc = nullptr;
139 
140     /** A reference count for the device allocation owned by this
141      * buffer. */
142     mutable DeviceRefCount *dev_ref_count = nullptr;
143 
144     /** True if T is of type void or const void */
145     static const bool T_is_void = std::is_same<typename std::remove_const<T>::type, void>::value;
146 
147     /** A type function that adds a const qualifier if T is a const type. */
148     template<typename T2>
149     using add_const_if_T_is_const = typename std::conditional<std::is_const<T>::value, const T2, T2>::type;
150 
151     /** T unless T is (const) void, in which case (const)
152      * uint8_t. Useful for providing return types for operator() */
153     using not_void_T = typename std::conditional<T_is_void,
154                                                  add_const_if_T_is_const<uint8_t>,
155                                                  T>::type;
156 
157     /** T with constness removed. Useful for return type of copy(). */
158     using not_const_T = typename std::remove_const<T>::type;
159 
160     /** The type the elements are stored as. Equal to not_void_T
161      * unless T is a pointer, in which case uint64_t. Halide stores
162      * all pointer types as uint64s internally, even on 32-bit
163      * systems. */
164     using storage_T = typename std::conditional<std::is_pointer<T>::value, uint64_t, not_void_T>::type;
165 
166 public:
167     /** True if the Halide type is not void (or const void). */
168     static constexpr bool has_static_halide_type = !T_is_void;
169 
170     /** Get the Halide type of T. Callers should not use the result if
171      * has_static_halide_type is false. */
172     static halide_type_t static_halide_type() {
173         return halide_type_of<typename std::remove_cv<not_void_T>::type>();
174     }
175 
176     /** Does this Buffer own the host memory it refers to? */
177     bool owns_host_memory() const {
178         return alloc != nullptr;
179     }
180 
181 private:
182     /** Increment the reference count of any owned allocation */
183     void incref() const {
184         if (owns_host_memory()) {
185             alloc->ref_count++;
186         }
187         if (buf.device) {
188             if (!dev_ref_count) {
189                 // I seem to have a non-zero dev field but no
190                 // reference count for it. I must have been given a
191                 // device allocation by a Halide pipeline, and have
192                 // never been copied from since. Take sole ownership
193                 // of it.
194                 dev_ref_count = new DeviceRefCount;
195             }
196             dev_ref_count->count++;
197         }
198     }
199 
    // Note that this is called "cropped" but can also encompass
    // slice/embed operations.
202     struct DevRefCountCropped : DeviceRefCount {
203         Buffer<T, D> cropped_from;
204         DevRefCountCropped(const Buffer<T, D> &cropped_from)
205             : cropped_from(cropped_from) {
206             ownership = BufferDeviceOwnership::Cropped;
207         }
208     };
209 
210     /** Setup the device ref count for a buffer to indicate it is a crop (or slice, embed, etc) of cropped_from */
211     void crop_from(const Buffer<T, D> &cropped_from) {
212         assert(dev_ref_count == nullptr);
213         dev_ref_count = new DevRefCountCropped(cropped_from);
214     }
215 
216     /** Decrement the reference count of any owned allocation and free host
217      * and device memory if it hits zero. Sets alloc to nullptr. */
218     void decref() {
219         if (owns_host_memory()) {
220             int new_count = --(alloc->ref_count);
221             if (new_count == 0) {
222                 void (*fn)(void *) = alloc->deallocate_fn;
223                 alloc->~AllocationHeader();
224                 fn(alloc);
225             }
226             buf.host = nullptr;
227             alloc = nullptr;
228             set_host_dirty(false);
229         }
230         decref_dev();
231     }
232 
233     void decref_dev() {
234         int new_count = 0;
235         if (dev_ref_count) {
236             new_count = --(dev_ref_count->count);
237         }
238         if (new_count == 0) {
239             if (buf.device) {
240                 assert(!(alloc && device_dirty()) &&
241                        "Implicitly freeing a dirty device allocation while a host allocation still lives. "
242                        "Call device_free explicitly if you want to drop dirty device-side data. "
243                        "Call copy_to_host explicitly if you want the data copied to the host allocation "
244                        "before the device allocation is freed.");
245                 if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative) {
246                     buf.device_interface->detach_native(nullptr, &buf);
247                 } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost) {
248                     buf.device_interface->device_and_host_free(nullptr, &buf);
249                 } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
250                     buf.device_interface->device_release_crop(nullptr, &buf);
251                 } else if (dev_ref_count == nullptr || dev_ref_count->ownership == BufferDeviceOwnership::Allocated) {
252                     buf.device_interface->device_free(nullptr, &buf);
253                 }
254             }
255             if (dev_ref_count) {
256                 if (dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
257                     delete (DevRefCountCropped *)dev_ref_count;
258                 } else {
259                     delete dev_ref_count;
260                 }
261             }
262         }
263         buf.device = 0;
264         buf.device_interface = nullptr;
265         dev_ref_count = nullptr;
266     }
267 
268     void free_shape_storage() {
269         if (buf.dim != shape) {
270             delete[] buf.dim;
271             buf.dim = nullptr;
272         }
273     }
274 
275     void make_shape_storage(const int dimensions) {
276         // This should usually be inlined, so if dimensions is statically known,
277         // we can skip the call to new
278         buf.dimensions = dimensions;
279         buf.dim = (dimensions <= D) ? shape : new halide_dimension_t[dimensions];
280     }
281 
282     void copy_shape_from(const halide_buffer_t &other) {
283         // All callers of this ensure that buf.dimensions == other.dimensions.
284         make_shape_storage(other.dimensions);
285         std::copy(other.dim, other.dim + other.dimensions, buf.dim);
286     }
287 
288     template<typename T2, int D2>
289     void move_shape_from(Buffer<T2, D2> &&other) {
290         if (other.shape == other.buf.dim) {
291             copy_shape_from(other.buf);
292         } else {
293             buf.dim = other.buf.dim;
294             other.buf.dim = nullptr;
295         }
296     }
297 
    /** Initialize this Buffer's fields and shape from a halide_buffer_t. */
299     void initialize_from_buffer(const halide_buffer_t &b,
300                                 BufferDeviceOwnership ownership) {
301         memcpy(&buf, &b, sizeof(halide_buffer_t));
302         copy_shape_from(b);
303         if (b.device) {
304             dev_ref_count = new DeviceRefCount;
305             dev_ref_count->ownership = ownership;
306         }
307     }
308 
309     /** Initialize the shape from an array of ints */
310     void initialize_shape(const int *sizes) {
311         for (int i = 0; i < buf.dimensions; i++) {
312             buf.dim[i].min = 0;
313             buf.dim[i].extent = sizes[i];
314             if (i == 0) {
315                 buf.dim[i].stride = 1;
316             } else {
317                 buf.dim[i].stride = buf.dim[i - 1].stride * buf.dim[i - 1].extent;
318             }
319         }
320     }
321 
322     /** Initialize the shape from a vector of extents */
323     void initialize_shape(const std::vector<int> &sizes) {
324         assert(buf.dimensions == (int)sizes.size());
325         initialize_shape(sizes.data());
326     }
327 
328     /** Initialize the shape from the static shape of an array */
329     template<typename Array, size_t N>
330     void initialize_shape_from_array_shape(int next, Array (&vals)[N]) {
331         buf.dim[next].min = 0;
332         buf.dim[next].extent = (int)N;
333         if (next == 0) {
334             buf.dim[next].stride = 1;
335         } else {
336             initialize_shape_from_array_shape(next - 1, vals[0]);
337             buf.dim[next].stride = buf.dim[next - 1].stride * buf.dim[next - 1].extent;
338         }
339     }
340 
341     /** Base case for the template recursion above. */
342     template<typename T2>
343     void initialize_shape_from_array_shape(int, const T2 &) {
344     }
345 
346     /** Get the dimensionality of a multi-dimensional C array */
347     template<typename Array, size_t N>
348     static int dimensionality_of_array(Array (&vals)[N]) {
349         return dimensionality_of_array(vals[0]) + 1;
350     }
351 
352     template<typename T2>
353     static int dimensionality_of_array(const T2 &) {
354         return 0;
355     }
356 
357     /** Get the underlying halide_type_t of an array's element type. */
358     template<typename Array, size_t N>
359     static halide_type_t scalar_type_of_array(Array (&vals)[N]) {
360         return scalar_type_of_array(vals[0]);
361     }
362 
363     template<typename T2>
364     static halide_type_t scalar_type_of_array(const T2 &) {
365         return halide_type_of<typename std::remove_cv<T2>::type>();
366     }
367 
368     /** Crop a single dimension without handling device allocation. */
369     void crop_host(int d, int min, int extent) {
370         assert(dim(d).min() <= min);
371         assert(dim(d).max() >= min + extent - 1);
372         int shift = min - dim(d).min();
373         if (buf.host != nullptr) {
374             buf.host += shift * dim(d).stride() * type().bytes();
375         }
376         buf.dim[d].min = min;
377         buf.dim[d].extent = extent;
378     }
379 
380     /** Crop as many dimensions as are in rect, without handling device allocation. */
381     void crop_host(const std::vector<std::pair<int, int>> &rect) {
382         assert(rect.size() <= static_cast<decltype(rect.size())>(std::numeric_limits<int>::max()));
383         int limit = (int)rect.size();
384         assert(limit <= dimensions());
385         for (int i = 0; i < limit; i++) {
386             crop_host(i, rect[i].first, rect[i].second);
387         }
388     }
389 
390     void complete_device_crop(Buffer<T, D> &result_host_cropped) const {
391         assert(buf.device_interface != nullptr);
392         if (buf.device_interface->device_crop(nullptr, &this->buf, &result_host_cropped.buf) == 0) {
393             const Buffer<T, D> *cropped_from = this;
            // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
            // Is it possible to get to this point without incref having run at least once since
            // the device field was set? (I.e. might incref already have been called in the
            // internal logic of crop?)
398             if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
399                 cropped_from = &((DevRefCountCropped *)dev_ref_count)->cropped_from;
400             }
401             result_host_cropped.crop_from(*cropped_from);
402         }
403     }
404 
    /** Slice a single dimension without handling device allocation. */
406     void slice_host(int d, int pos) {
407         assert(d >= 0 && d < dimensions());
408         assert(pos >= dim(d).min() && pos <= dim(d).max());
409         buf.dimensions--;
410         int shift = pos - buf.dim[d].min;
411         if (buf.host != nullptr) {
412             buf.host += shift * buf.dim[d].stride * type().bytes();
413         }
414         for (int i = d; i < buf.dimensions; i++) {
415             buf.dim[i] = buf.dim[i + 1];
416         }
417         buf.dim[buf.dimensions] = {0, 0, 0};
418     }
419 
420     void complete_device_slice(Buffer<T, D> &result_host_sliced, int d, int pos) const {
421         assert(buf.device_interface != nullptr);
422         if (buf.device_interface->device_slice(nullptr, &this->buf, d, pos, &result_host_sliced.buf) == 0) {
423             const Buffer<T, D> *sliced_from = this;
            // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
            // Is it possible to get to this point without incref having run at least once since
            // the device field was set? (I.e. might incref already have been called in the
            // internal logic of slice?)
428             if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
429                 sliced_from = &((DevRefCountCropped *)dev_ref_count)->cropped_from;
430             }
431             // crop_from() is correct here, despite the fact that we are slicing.
432             result_host_sliced.crop_from(*sliced_from);
433         }
434     }
435 
436 public:
437     typedef T ElemType;
438 
439     /** Read-only access to the shape */
440     class Dimension {
441         const halide_dimension_t &d;
442 
443     public:
444         /** The lowest coordinate in this dimension */
445         HALIDE_ALWAYS_INLINE int min() const {
446             return d.min;
447         }
448 
449         /** The number of elements in memory you have to step over to
450          * increment this coordinate by one. */
451         HALIDE_ALWAYS_INLINE int stride() const {
452             return d.stride;
453         }
454 
455         /** The extent of the image along this dimension */
456         HALIDE_ALWAYS_INLINE int extent() const {
457             return d.extent;
458         }
459 
460         /** The highest coordinate in this dimension */
461         HALIDE_ALWAYS_INLINE int max() const {
462             return min() + extent() - 1;
463         }
464 
        /** An iterator class, so that you can iterate over
         * coordinates in a dimension using a range-based for loop. */
467         struct iterator {
468             int val;
469             int operator*() const {
470                 return val;
471             }
472             bool operator!=(const iterator &other) const {
473                 return val != other.val;
474             }
475             iterator &operator++() {
476                 val++;
477                 return *this;
478             }
479         };
480 
481         /** An iterator that points to the min coordinate */
482         HALIDE_ALWAYS_INLINE iterator begin() const {
483             return {min()};
484         }
485 
486         /** An iterator that points to one past the max coordinate */
487         HALIDE_ALWAYS_INLINE iterator end() const {
488             return {min() + extent()};
489         }
490 
        Dimension(const halide_dimension_t &dim)
            : d(dim) {
        }
493     };
494 
495     /** Access the shape of the buffer */
496     HALIDE_ALWAYS_INLINE Dimension dim(int i) const {
497         assert(i >= 0 && i < this->dimensions());
498         return Dimension(buf.dim[i]);
499     }
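
    // For illustration, dim() together with Dimension's begin()/end() supports
    // range-based iteration over a buffer's coordinates. A minimal sketch,
    // assuming a two-dimensional Buffer<float> named im:
    //
    //     for (int y : im.dim(1)) {
    //         for (int x : im.dim(0)) {
    //             im(x, y) = 0.0f;
    //         }
    //     }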
500 
501     /** Access to the mins, strides, extents. Will be deprecated. Do not use. */
502     // @{
503     int min(int i) const {
504         return dim(i).min();
505     }
506     int extent(int i) const {
507         return dim(i).extent();
508     }
509     int stride(int i) const {
510         return dim(i).stride();
511     }
512     // @}
513 
514     /** The total number of elements this buffer represents. Equal to
515      * the product of the extents */
516     size_t number_of_elements() const {
517         size_t s = 1;
518         for (int i = 0; i < dimensions(); i++) {
519             s *= dim(i).extent();
520         }
521         return s;
522     }
523 
524     /** Get the dimensionality of the buffer. */
525     int dimensions() const {
526         return buf.dimensions;
527     }
528 
529     /** Get the type of the elements. */
530     halide_type_t type() const {
531         return buf.type;
532     }
533 
534 private:
535     /** Offset to the element with the lowest address. If all
536      * strides are positive, equal to zero. Offset is in elements, not bytes. */
537     ptrdiff_t begin_offset() const {
538         ptrdiff_t index = 0;
539         for (int i = 0; i < dimensions(); i++) {
540             if (dim(i).stride() < 0) {
541                 index += dim(i).stride() * (dim(i).extent() - 1);
542             }
543         }
544         return index;
545     }
546 
547     /** An offset to one beyond the element with the highest address.
548      * Offset is in elements, not bytes. */
549     ptrdiff_t end_offset() const {
550         ptrdiff_t index = 0;
551         for (int i = 0; i < dimensions(); i++) {
552             if (dim(i).stride() > 0) {
553                 index += dim(i).stride() * (dim(i).extent() - 1);
554             }
555         }
556         index += 1;
557         return index;
558     }
559 
560 public:
561     /** A pointer to the element with the lowest address. If all
562      * strides are positive, equal to the host pointer. */
563     T *begin() const {
564         assert(buf.host != nullptr);  // Cannot call begin() on an unallocated Buffer.
565         return (T *)(buf.host + begin_offset() * type().bytes());
566     }
567 
568     /** A pointer to one beyond the element with the highest address. */
569     T *end() const {
570         assert(buf.host != nullptr);  // Cannot call end() on an unallocated Buffer.
571         return (T *)(buf.host + end_offset() * type().bytes());
572     }
573 
574     /** The total number of bytes spanned by the data in memory. */
575     size_t size_in_bytes() const {
576         return (size_t)(end_offset() - begin_offset()) * type().bytes();
577     }
578 
579     /** Reset the Buffer to be equivalent to a default-constructed Buffer
580      * of the same static type (if any); Buffer<void> will have its runtime
581      * type reset to uint8. */
582     void reset() {
583         *this = Buffer();
584     }
585 
586     Buffer()
587         : shape() {
588         buf.type = static_halide_type();
589         make_shape_storage(0);
590     }
591 
592     /** Make a Buffer from a halide_buffer_t */
593     explicit Buffer(const halide_buffer_t &buf,
594                     BufferDeviceOwnership ownership = BufferDeviceOwnership::Unmanaged) {
595         assert(T_is_void || buf.type == static_halide_type());
596         initialize_from_buffer(buf, ownership);
597     }
598 
599     /** Give Buffers access to the members of Buffers of different dimensionalities and types. */
600     template<typename T2, int D2>
601     friend class Buffer;
602 
603 private:
604     template<typename T2, int D2>
605     static void static_assert_can_convert_from() {
606         static_assert((!std::is_const<T2>::value || std::is_const<T>::value),
607                       "Can't convert from a Buffer<const T> to a Buffer<T>");
608         static_assert(std::is_same<typename std::remove_const<T>::type,
609                                    typename std::remove_const<T2>::type>::value ||
610                           T_is_void || Buffer<T2, D2>::T_is_void,
611                       "type mismatch constructing Buffer");
612     }
613 
614 public:
    /** Determine if a Buffer<T, D> can be constructed from some other Buffer type.
     * If the conversion is impossible at compile time, fail with a static assert;
     * otherwise return a boolean based on runtime typing. */
618     template<typename T2, int D2>
619     static bool can_convert_from(const Buffer<T2, D2> &other) {
620         static_assert_can_convert_from<T2, D2>();
621         if (Buffer<T2, D2>::T_is_void && !T_is_void) {
622             return other.type() == static_halide_type();
623         }
624         return true;
625     }
626 
    /** Fail an assertion at runtime or compile-time if a Buffer<T, D>
     * cannot be constructed from some other Buffer type. */
629     template<typename T2, int D2>
630     static void assert_can_convert_from(const Buffer<T2, D2> &other) {
631         // Explicitly call static_assert_can_convert_from() here so
632         // that we always get compile-time checking, even if compiling with
633         // assertions disabled.
634         static_assert_can_convert_from<T2, D2>();
635         assert(can_convert_from(other));
636     }
637 
638     /** Copy constructor. Does not copy underlying data. */
639     Buffer(const Buffer<T, D> &other)
640         : buf(other.buf),
641           alloc(other.alloc) {
642         other.incref();
643         dev_ref_count = other.dev_ref_count;
644         copy_shape_from(other.buf);
645     }
646 
    /** Construct a Buffer from a Buffer of different dimensionality
     * and type. Asserts that the type matches (at runtime, if one of
     * the types is void). Note that this constructor is
     * implicit. This, for example, lets you pass things like
     * Buffer<T> or Buffer<const void> to functions expecting
     * Buffer<const T>. */
653     template<typename T2, int D2>
654     Buffer(const Buffer<T2, D2> &other)
655         : buf(other.buf),
656           alloc(other.alloc) {
657         assert_can_convert_from(other);
658         other.incref();
659         dev_ref_count = other.dev_ref_count;
660         copy_shape_from(other.buf);
661     }
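
    // For illustration, the implicit conversion above allows calls like the
    // following, where process() is a hypothetical function:
    //
    //     void process(const Halide::Runtime::Buffer<const float> &input);
    //
    //     Halide::Runtime::Buffer<float> im(640, 480);
    //     process(im);  // Buffer<float> converts to Buffer<const float>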
662 
663     /** Move constructor */
664     Buffer(Buffer<T, D> &&other) noexcept
665         : buf(other.buf),
666           alloc(other.alloc),
667           dev_ref_count(other.dev_ref_count) {
668         other.dev_ref_count = nullptr;
669         other.alloc = nullptr;
670         move_shape_from(std::forward<Buffer<T, D>>(other));
671         other.buf = halide_buffer_t();
672     }
673 
674     /** Move-construct a Buffer from a Buffer of different
675      * dimensionality and type. Asserts that the types match (at
676      * runtime if one of the types is void). */
677     template<typename T2, int D2>
678     Buffer(Buffer<T2, D2> &&other)
679         : buf(other.buf),
680           alloc(other.alloc),
681           dev_ref_count(other.dev_ref_count) {
682         assert_can_convert_from(other);
683         other.dev_ref_count = nullptr;
684         other.alloc = nullptr;
685         move_shape_from(std::forward<Buffer<T2, D2>>(other));
686         other.buf = halide_buffer_t();
687     }
688 
689     /** Assign from another Buffer of possibly-different
690      * dimensionality and type. Asserts that the types match (at
691      * runtime if one of the types is void). */
692     template<typename T2, int D2>
693     Buffer<T, D> &operator=(const Buffer<T2, D2> &other) {
694         if ((const void *)this == (const void *)&other) {
695             return *this;
696         }
697         assert_can_convert_from(other);
698         other.incref();
699         decref();
700         dev_ref_count = other.dev_ref_count;
701         alloc = other.alloc;
702         free_shape_storage();
703         buf = other.buf;
704         copy_shape_from(other.buf);
705         return *this;
706     }
707 
708     /** Standard assignment operator */
709     Buffer<T, D> &operator=(const Buffer<T, D> &other) {
710         if (this == &other) {
711             return *this;
712         }
713         other.incref();
714         decref();
715         dev_ref_count = other.dev_ref_count;
716         alloc = other.alloc;
717         free_shape_storage();
718         buf = other.buf;
719         copy_shape_from(other.buf);
720         return *this;
721     }
722 
723     /** Move from another Buffer of possibly-different
724      * dimensionality and type. Asserts that the types match (at
725      * runtime if one of the types is void). */
726     template<typename T2, int D2>
727     Buffer<T, D> &operator=(Buffer<T2, D2> &&other) {
728         assert_can_convert_from(other);
729         decref();
730         alloc = other.alloc;
731         other.alloc = nullptr;
732         dev_ref_count = other.dev_ref_count;
733         other.dev_ref_count = nullptr;
734         free_shape_storage();
735         buf = other.buf;
736         move_shape_from(std::forward<Buffer<T2, D2>>(other));
737         other.buf = halide_buffer_t();
738         return *this;
739     }
740 
741     /** Standard move-assignment operator */
742     Buffer<T, D> &operator=(Buffer<T, D> &&other) noexcept {
743         decref();
744         alloc = other.alloc;
745         other.alloc = nullptr;
746         dev_ref_count = other.dev_ref_count;
747         other.dev_ref_count = nullptr;
748         free_shape_storage();
749         buf = other.buf;
750         move_shape_from(std::forward<Buffer<T, D>>(other));
751         other.buf = halide_buffer_t();
752         return *this;
753     }
754 
    /** Check that the total size of the buffer (the product of the extents
     * and the element size) does not overflow. */
756     void check_overflow() {
757         size_t size = type().bytes();
758         for (int i = 0; i < dimensions(); i++) {
759             size *= dim(i).extent();
760         }
761         // We allow 2^31 or 2^63 bytes, so drop the top bit.
762         size = (size << 1) >> 1;
763         for (int i = 0; i < dimensions(); i++) {
764             size /= dim(i).extent();
765         }
766         assert(size == (size_t)type().bytes() && "Error: Overflow computing total size of buffer.");
767     }
768 
769     /** Allocate memory for this Buffer. Drops the reference to any
770      * owned memory. */
771     void allocate(void *(*allocate_fn)(size_t) = nullptr,
772                   void (*deallocate_fn)(void *) = nullptr) {
773         if (!allocate_fn) {
774             allocate_fn = malloc;
775         }
776         if (!deallocate_fn) {
777             deallocate_fn = free;
778         }
779 
780         // Drop any existing allocation
781         deallocate();
782 
783         // Conservatively align images to 128 bytes. This is enough
784         // alignment for all the platforms we might use.
785         size_t size = size_in_bytes();
786         const size_t alignment = 128;
787         size = (size + alignment - 1) & ~(alignment - 1);
788         void *alloc_storage = allocate_fn(size + sizeof(AllocationHeader) + alignment - 1);
789         alloc = new (alloc_storage) AllocationHeader(deallocate_fn);
790         uint8_t *unaligned_ptr = ((uint8_t *)alloc) + sizeof(AllocationHeader);
791         buf.host = (uint8_t *)((uintptr_t)(unaligned_ptr + alignment - 1) & ~(alignment - 1));
792     }
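
    // For illustration, a minimal sketch of the deferred-allocation pattern
    // used elsewhere in this file: wrap a null host pointer with the desired
    // shape, then call allocate(). The allocator names here are hypothetical:
    //
    //     Halide::Runtime::Buffer<float> im(nullptr, 1024, 1024);
    //     im.allocate(my_aligned_malloc, my_aligned_free);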
793 
794     /** Drop reference to any owned host or device memory, possibly
795      * freeing it, if this buffer held the last reference to
796      * it. Retains the shape of the buffer. Does nothing if this
797      * buffer did not allocate its own memory. */
798     void deallocate() {
799         decref();
800     }
801 
802     /** Drop reference to any owned device memory, possibly freeing it
803      * if this buffer held the last reference to it. Asserts that
804      * device_dirty is false. */
805     void device_deallocate() {
806         decref_dev();
807     }
808 
809     /** Allocate a new image of the given size with a runtime
810      * type. Only used when you do know what size you want but you
811      * don't know statically what type the elements are. Pass zeroes
812      * to make a buffer suitable for bounds query calls. */
813     template<typename... Args,
814              typename = typename std::enable_if<AllInts<Args...>::value>::type>
815     Buffer(halide_type_t t, int first, Args... rest) {
816         if (!T_is_void) {
817             assert(static_halide_type() == t);
818         }
819         int extents[] = {first, (int)rest...};
820         buf.type = t;
821         constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
822         make_shape_storage(buf_dimensions);
823         initialize_shape(extents);
824         if (!Internal::any_zero(extents)) {
825             check_overflow();
826             allocate();
827         }
828     }
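
    // For illustration, a minimal bounds-query sketch: constructing with zero
    // extents leaves the buffer unallocated, so an AOT-compiled Halide pipeline
    // (my_pipeline is hypothetical) will fill in the shape it requires instead
    // of executing:
    //
    //     Halide::Runtime::Buffer<void> query(halide_type_of<float>(), 0, 0);
    //     my_pipeline(input, query);
    //     // query.dim(i).extent() now reports the required size in dimension i.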
829 
830     /** Allocate a new image of the given size. Pass zeroes to make a
831      * buffer suitable for bounds query calls. */
832     // @{
833 
    // The overload with one argument is 'explicit', so that
    // (say) int is not implicitly convertible to Buffer<int>
    explicit Buffer(int first) {
        static_assert(!T_is_void,
                      "To construct a Buffer<void>, pass a halide_type_t as the first argument to the constructor");
839         int extents[] = {first};
840         buf.type = static_halide_type();
841         constexpr int buf_dimensions = 1;
842         make_shape_storage(buf_dimensions);
843         initialize_shape(extents);
844         if (first != 0) {
845             check_overflow();
846             allocate();
847         }
848     }
849 
850     template<typename... Args,
851              typename = typename std::enable_if<AllInts<Args...>::value>::type>
852     Buffer(int first, int second, Args... rest) {
        static_assert(!T_is_void,
                      "To construct a Buffer<void>, pass a halide_type_t as the first argument to the constructor");
855         int extents[] = {first, second, (int)rest...};
856         buf.type = static_halide_type();
857         constexpr int buf_dimensions = 2 + (int)(sizeof...(rest));
858         make_shape_storage(buf_dimensions);
859         initialize_shape(extents);
860         if (!Internal::any_zero(extents)) {
861             check_overflow();
862             allocate();
863         }
864     }
865     // @}
866 
867     /** Allocate a new image of unknown type using a vector of ints as the size. */
868     Buffer(halide_type_t t, const std::vector<int> &sizes) {
869         if (!T_is_void) {
870             assert(static_halide_type() == t);
871         }
872         buf.type = t;
873         make_shape_storage((int)sizes.size());
874         initialize_shape(sizes);
875         if (!Internal::any_zero(sizes)) {
876             check_overflow();
877             allocate();
878         }
879     }
880 
881     /** Allocate a new image of known type using a vector of ints as the size. */
882     explicit Buffer(const std::vector<int> &sizes)
883         : Buffer(static_halide_type(), sizes) {
884     }
885 
886 private:
887     // Create a copy of the sizes vector, ordered as specified by order.
888     static std::vector<int> make_ordered_sizes(const std::vector<int> &sizes, const std::vector<int> &order) {
889         assert(order.size() == sizes.size());
890         std::vector<int> ordered_sizes(sizes.size());
891         for (size_t i = 0; i < sizes.size(); ++i) {
892             ordered_sizes[i] = sizes.at(order[i]);
893         }
894         return ordered_sizes;
895     }
896 
897 public:
898     /** Allocate a new image of unknown type using a vector of ints as the size and
899      * a vector of indices indicating the storage order for each dimension. The
900      * length of the sizes vector and the storage-order vector must match. For instance,
901      * to allocate an interleaved RGB buffer, you would pass {2, 0, 1} for storage_order. */
902     Buffer(halide_type_t t, const std::vector<int> &sizes, const std::vector<int> &storage_order)
903         : Buffer(t, make_ordered_sizes(sizes, storage_order)) {
904         transpose(storage_order);
905     }
906 
907     Buffer(const std::vector<int> &sizes, const std::vector<int> &storage_order)
908         : Buffer(static_halide_type(), sizes, storage_order) {
909     }
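
    // For illustration, a sketch of the interleaved case described above: an
    // RGB image stored channel-innermost, but still addressed as (x, y, c):
    //
    //     Halide::Runtime::Buffer<uint8_t> rgb({640, 480, 3}, {2, 0, 1});
    //     // rgb.dim(2).stride() == 1, rgb.dim(0).stride() == 3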
910 
    /** Make a Buffer that refers to a statically sized array. Does not
     * take ownership of the data, and does not set the host_dirty flag. */
913     template<typename Array, size_t N>
914     explicit Buffer(Array (&vals)[N]) {
915         const int buf_dimensions = dimensionality_of_array(vals);
916         buf.type = scalar_type_of_array(vals);
917         buf.host = (uint8_t *)vals;
918         make_shape_storage(buf_dimensions);
919         initialize_shape_from_array_shape(buf.dimensions - 1, vals);
920     }
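
    // For illustration, a minimal sketch of wrapping a static array. Note that
    // the innermost (fastest-varying) dimension is the last C array dimension:
    //
    //     int data[480][640];                     // data[y][x]
    //     Halide::Runtime::Buffer<int> im(data);  // im.dim(0).extent() == 640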
921 
    /** Initialize a Buffer of runtime type from a pointer and some
923      * sizes. Assumes dense row-major packing and a min coordinate of
924      * zero. Does not take ownership of the data and does not set the
925      * host_dirty flag. */
926     template<typename... Args,
927              typename = typename std::enable_if<AllInts<Args...>::value>::type>
928     explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int first, Args &&... rest) {
929         if (!T_is_void) {
930             assert(static_halide_type() == t);
931         }
932         int extents[] = {first, (int)rest...};
933         buf.type = t;
934         constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
935         buf.host = (uint8_t *)const_cast<void *>(data);
936         make_shape_storage(buf_dimensions);
937         initialize_shape(extents);
938     }
939 
    /** Initialize a Buffer from a pointer and some sizes. Assumes
941      * dense row-major packing and a min coordinate of zero. Does not
942      * take ownership of the data and does not set the host_dirty flag. */
943     template<typename... Args,
944              typename = typename std::enable_if<AllInts<Args...>::value>::type>
945     explicit Buffer(T *data, int first, Args &&... rest) {
946         int extents[] = {first, (int)rest...};
947         buf.type = static_halide_type();
948         constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
949         buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
950         make_shape_storage(buf_dimensions);
951         initialize_shape(extents);
952     }
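
    // For illustration, a minimal sketch of wrapping memory owned by someone
    // else (here a std::vector) without copying it:
    //
    //     std::vector<float> storage(640 * 480);
    //     Halide::Runtime::Buffer<float> im(storage.data(), 640, 480);
    //     // im does not own storage; keep the vector alive while im is in use.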
953 
    /** Initialize a Buffer from a pointer and a vector of
955      * sizes. Assumes dense row-major packing and a min coordinate of
956      * zero. Does not take ownership of the data and does not set the
957      * host_dirty flag. */
958     explicit Buffer(T *data, const std::vector<int> &sizes) {
959         buf.type = static_halide_type();
960         buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
961         make_shape_storage((int)sizes.size());
962         initialize_shape(sizes);
963     }
964 
    /** Initialize a Buffer of runtime type from a pointer and a
966      * vector of sizes. Assumes dense row-major packing and a min
967      * coordinate of zero. Does not take ownership of the data and
968      * does not set the host_dirty flag. */
969     explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, const std::vector<int> &sizes) {
970         if (!T_is_void) {
971             assert(static_halide_type() == t);
972         }
973         buf.type = t;
974         buf.host = (uint8_t *)const_cast<void *>(data);
975         make_shape_storage((int)sizes.size());
976         initialize_shape(sizes);
977     }
978 
    /** Initialize a Buffer from a pointer to the min coordinate and
980      * an array describing the shape.  Does not take ownership of the
981      * data, and does not set the host_dirty flag. */
982     explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int d, const halide_dimension_t *shape) {
983         if (!T_is_void) {
984             assert(static_halide_type() == t);
985         }
986         buf.type = t;
987         buf.host = (uint8_t *)const_cast<void *>(data);
988         make_shape_storage(d);
989         for (int i = 0; i < d; i++) {
990             buf.dim[i] = shape[i];
991         }
992     }
993 
994     /** Initialize a Buffer from a pointer to the min coordinate and
995      * a vector describing the shape.  Does not take ownership of the
996      * data, and does not set the host_dirty flag. */
997     explicit inline Buffer(halide_type_t t, add_const_if_T_is_const<void> *data,
998                            const std::vector<halide_dimension_t> &shape)
999         : Buffer(t, data, (int)shape.size(), shape.data()) {
1000     }
1001 
    /** Initialize a Buffer from a pointer to the min coordinate and
1003      * an array describing the shape.  Does not take ownership of the
1004      * data and does not set the host_dirty flag. */
1005     explicit Buffer(T *data, int d, const halide_dimension_t *shape) {
1006         buf.type = static_halide_type();
1007         buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
1008         make_shape_storage(d);
1009         for (int i = 0; i < d; i++) {
1010             buf.dim[i] = shape[i];
1011         }
1012     }
1013 
1014     /** Initialize a Buffer from a pointer to the min coordinate and
1015      * a vector describing the shape.  Does not take ownership of the
1016      * data, and does not set the host_dirty flag. */
1017     explicit inline Buffer(T *data, const std::vector<halide_dimension_t> &shape)
1018         : Buffer(data, (int)shape.size(), shape.data()) {
1019     }
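
    // For illustration, a sketch of describing an existing channel-interleaved
    // RGB image with explicit halide_dimension_t entries (min, extent, stride,
    // flags) rather than the dense row-major default. The pixel pointer is
    // hypothetical:
    //
    //     uint8_t *pixels = get_pixels();  // 640x480, interleaved RGB
    //     halide_dimension_t shape[] = {{0, 640, 3, 0},
    //                                   {0, 480, 640 * 3, 0},
    //                                   {0, 3, 1, 0}};
    //     Halide::Runtime::Buffer<uint8_t> im(pixels, 3, shape);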
1020 
1021     /** Destructor. Will release any underlying owned allocation if
1022      * this is the last reference to it. Will assert fail if there are
1023      * weak references to this Buffer outstanding. */
1024     ~Buffer() {
1025         free_shape_storage();
1026         decref();
1027     }
1028 
1029     /** Get a pointer to the raw halide_buffer_t this wraps. */
1030     // @{
1031     halide_buffer_t *raw_buffer() {
1032         return &buf;
1033     }
1034 
1035     const halide_buffer_t *raw_buffer() const {
1036         return &buf;
1037     }
1038     // @}
1039 
1040     /** Provide a cast operator to halide_buffer_t *, so that
1041      * instances can be passed directly to Halide filters. */
1042     operator halide_buffer_t *() {
1043         return &buf;
1044     }
1045 
1046     /** Return a typed reference to this Buffer. Useful for converting
1047      * a reference to a Buffer<void> to a reference to, for example, a
1048      * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1049      * Does a runtime assert if the source buffer type is void. */
1050     template<typename T2, int D2 = D,
1051              typename = typename std::enable_if<(D2 <= D)>::type>
1052     HALIDE_ALWAYS_INLINE
1053         Buffer<T2, D2> &
1054         as() & {
1055         Buffer<T2, D>::assert_can_convert_from(*this);
1056         return *((Buffer<T2, D2> *)this);
1057     }
1058 
1059     /** Return a const typed reference to this Buffer. Useful for
     * converting a const reference to one Buffer type to a const
1061      * reference to another Buffer type. Does a runtime assert if the
1062      * source buffer type is void. */
1063     template<typename T2, int D2 = D,
1064              typename = typename std::enable_if<(D2 <= D)>::type>
1065     HALIDE_ALWAYS_INLINE const Buffer<T2, D2> &as() const & {
1066         Buffer<T2, D>::assert_can_convert_from(*this);
1067         return *((const Buffer<T2, D2> *)this);
1068     }
1069 
1070     /** Returns this rval Buffer with a different type attached. Does
1071      * a dynamic type check if the source type is void. */
1072     template<typename T2, int D2 = D>
1073     HALIDE_ALWAYS_INLINE
1074         Buffer<T2, D2>
1075         as() && {
1076         Buffer<T2, D2>::assert_can_convert_from(*this);
1077         return *((Buffer<T2, D2> *)this);
1078     }
1079 
1080     /** as_const() is syntactic sugar for .as<const T>(), to avoid the need
1081      * to recapitulate the type argument. */
1082     // @{
1083     HALIDE_ALWAYS_INLINE
1084     Buffer<typename std::add_const<T>::type, D> &as_const() & {
1085         // Note that we can skip the assert_can_convert_from(), since T -> const T
1086         // conversion is always legal.
1087         return *((Buffer<typename std::add_const<T>::type> *)this);
1088     }
1089 
1090     HALIDE_ALWAYS_INLINE
1091     const Buffer<typename std::add_const<T>::type, D> &as_const() const & {
1092         return *((const Buffer<typename std::add_const<T>::type> *)this);
1093     }
1094 
1095     HALIDE_ALWAYS_INLINE
1096     Buffer<typename std::add_const<T>::type, D> as_const() && {
1097         return *((Buffer<typename std::add_const<T>::type> *)this);
1098     }
1099     // @}
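
    // For illustration, a minimal sketch of reinterpreting a type-erased buffer
    // (checked at runtime) and of adding constness (checked at compile time):
    //
    //     Halide::Runtime::Buffer<void> untyped = make_a_buffer();  // hypothetical
    //     Halide::Runtime::Buffer<float> &typed = untyped.as<float>();
    //     Halide::Runtime::Buffer<const float> &ro = typed.as_const();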
1100 
1101     /** Conventional names for the first three dimensions. */
1102     // @{
1103     int width() const {
1104         return (dimensions() > 0) ? dim(0).extent() : 1;
1105     }
1106     int height() const {
1107         return (dimensions() > 1) ? dim(1).extent() : 1;
1108     }
1109     int channels() const {
1110         return (dimensions() > 2) ? dim(2).extent() : 1;
1111     }
1112     // @}
1113 
1114     /** Conventional names for the min and max value of each dimension */
1115     // @{
1116     int left() const {
1117         return dim(0).min();
1118     }
1119 
1120     int right() const {
1121         return dim(0).max();
1122     }
1123 
1124     int top() const {
1125         return dim(1).min();
1126     }
1127 
1128     int bottom() const {
1129         return dim(1).max();
1130     }
1131     // @}
1132 
1133     /** Make a new image which is a deep copy of this image. Use crop
1134      * or slice followed by copy to make a copy of only a portion of
1135      * the image. The new image uses the same memory layout as the
1136      * original, with holes compacted away. Note that the returned
     * Buffer is always of a non-const type T; that is,
1138      *
1139      *     Buffer<const T>.copy() -> Buffer<T> rather than Buffer<const T>
1140      *
1141      * which is always safe, since we are making a deep copy. (The caller
1142      * can easily cast it back to Buffer<const T> if desired, which is
1143      * always safe and free.)
1144      */
1145     Buffer<not_const_T, D> copy(void *(*allocate_fn)(size_t) = nullptr,
1146                                 void (*deallocate_fn)(void *) = nullptr) const {
1147         Buffer<not_const_T, D> dst = Buffer<not_const_T, D>::make_with_shape_of(*this, allocate_fn, deallocate_fn);
1148         dst.copy_from(*this);
1149         return dst;
1150     }
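
    // For illustration, a minimal sketch of compacting a region of interest:
    // crop to the region, then deep-copy it into a freshly laid-out buffer.
    // The input buffer here is hypothetical:
    //
    //     Halide::Runtime::Buffer<const uint8_t> roi = input.cropped(0, 32, 64);
    //     Halide::Runtime::Buffer<uint8_t> packed = roi.copy();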
1151 
1152     /** Like copy(), but the copy is created in interleaved memory layout
1153      * (vs. keeping the same memory layout as the original). Requires that 'this'
1154      * has exactly 3 dimensions.
1155      */
1156     Buffer<not_const_T, D> copy_to_interleaved(void *(*allocate_fn)(size_t) = nullptr,
1157                                                void (*deallocate_fn)(void *) = nullptr) const {
1158         assert(dimensions() == 3);
1159         Buffer<not_const_T, D> dst = Buffer<not_const_T, D>::make_interleaved(nullptr, width(), height(), channels());
1160         dst.set_min(min(0), min(1), min(2));
1161         dst.allocate(allocate_fn, deallocate_fn);
1162         dst.copy_from(*this);
1163         return dst;
1164     }
1165 
1166     /** Like copy(), but the copy is created in planar memory layout
1167      * (vs. keeping the same memory layout as the original).
1168      */
1169     Buffer<not_const_T, D> copy_to_planar(void *(*allocate_fn)(size_t) = nullptr,
1170                                           void (*deallocate_fn)(void *) = nullptr) const {
1171         std::vector<int> mins, extents;
1172         const int dims = dimensions();
1173         mins.reserve(dims);
1174         extents.reserve(dims);
1175         for (int d = 0; d < dims; ++d) {
1176             mins.push_back(dim(d).min());
1177             extents.push_back(dim(d).extent());
1178         }
1179         Buffer<not_const_T, D> dst = Buffer<not_const_T, D>(nullptr, extents);
1180         dst.set_min(mins);
1181         dst.allocate(allocate_fn, deallocate_fn);
1182         dst.copy_from(*this);
1183         return dst;
1184     }
1185 
1186     /** Make a copy of the Buffer which shares the underlying host and/or device
1187      * allocations as the existing Buffer. This is purely syntactic sugar for
1188      * cases where you have a const reference to a Buffer but need a temporary
1189      * non-const copy (e.g. to make a call into AOT-generated Halide code), and want a terse
1190      * inline way to create a temporary. \code
1191      * void call_my_func(const Buffer<const uint8_t>& input) {
1192      *     my_func(input.alias(), output);
1193      * }\endcode
1194      */
1195     inline Buffer<T, D> alias() const {
1196         return *this;
1197     }
1198 
1199     /** Fill a Buffer with the values at the same coordinates in
1200      * another Buffer. Restricts itself to coordinates contained
1201      * within the intersection of the two buffers. If the two Buffers
1202      * are not in the same coordinate system, you will need to
1203      * translate the argument Buffer first. E.g. if you're blitting a
1204      * sprite onto a framebuffer, you'll want to translate the sprite
1205      * to the correct location first like so: \code
1206      * framebuffer.copy_from(sprite.translated({x, y})); \endcode
1207     */
1208     template<typename T2, int D2>
1209     void copy_from(const Buffer<T2, D2> &other) {
1210         static_assert(!std::is_const<T>::value, "Cannot call copy_from() on a Buffer<const T>");
1211         assert(!device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty destination.");
1212         assert(!other.device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty source.");
1213 
1214         Buffer<const T, D> src(other);
1215         Buffer<T, D> dst(*this);
1216 
1217         assert(src.dimensions() == dst.dimensions());
1218 
1219         // Trim the copy to the region in common
1220         for (int i = 0; i < dimensions(); i++) {
1221             int min_coord = std::max(dst.dim(i).min(), src.dim(i).min());
1222             int max_coord = std::min(dst.dim(i).max(), src.dim(i).max());
1223             if (max_coord < min_coord) {
1224                 // The buffers do not overlap.
1225                 return;
1226             }
1227             dst.crop(i, min_coord, max_coord - min_coord + 1);
1228             src.crop(i, min_coord, max_coord - min_coord + 1);
1229         }
1230 
        // If T is void, we need to do runtime dispatch to an
        // appropriately-typed lambda. We're copying, so we only care
        // about the element size. (If T is not void, this should
        // optimize away into a static dispatch to the right-sized copy.)
1235         if (T_is_void ? (type().bytes() == 1) : (sizeof(not_void_T) == 1)) {
1236             using MemType = uint8_t;
1237             auto &typed_dst = (Buffer<MemType, D> &)dst;
1238             auto &typed_src = (Buffer<const MemType, D> &)src;
1239             typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1240         } else if (T_is_void ? (type().bytes() == 2) : (sizeof(not_void_T) == 2)) {
1241             using MemType = uint16_t;
1242             auto &typed_dst = (Buffer<MemType, D> &)dst;
1243             auto &typed_src = (Buffer<const MemType, D> &)src;
1244             typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1245         } else if (T_is_void ? (type().bytes() == 4) : (sizeof(not_void_T) == 4)) {
1246             using MemType = uint32_t;
1247             auto &typed_dst = (Buffer<MemType, D> &)dst;
1248             auto &typed_src = (Buffer<const MemType, D> &)src;
1249             typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1250         } else if (T_is_void ? (type().bytes() == 8) : (sizeof(not_void_T) == 8)) {
1251             using MemType = uint64_t;
1252             auto &typed_dst = (Buffer<MemType, D> &)dst;
1253             auto &typed_src = (Buffer<const MemType, D> &)src;
1254             typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1255         } else {
1256             assert(false && "type().bytes() must be 1, 2, 4, or 8");
1257         }
1258         set_host_dirty();
1259     }
1260 
1261     /** Make an image that refers to a sub-range of this image along
1262      * the given dimension. Asserts that the crop region is within
1263      * the existing bounds: you cannot "crop outwards", even if you know there
1264      * is valid Buffer storage (e.g. because you already cropped inwards). */
1265     Buffer<T, D> cropped(int d, int min, int extent) const {
1266         // Make a fresh copy of the underlying buffer (but not a fresh
1267         // copy of the allocation, if there is one).
1268         Buffer<T, D> im = *this;
1269 
        // This guarantees the preexisting device ref is dropped if the
1271         // device_crop call fails and maintains the buffer in a consistent
1272         // state.
1273         im.device_deallocate();
1274 
1275         im.crop_host(d, min, extent);
1276         if (buf.device_interface != nullptr) {
1277             complete_device_crop(im);
1278         }
1279         return im;
1280     }
1281 
1282     /** Crop an image in-place along the given dimension. This does
1283      * not move any data around in memory - it just changes the min
1284      * and extent of the given dimension. */
1285     void crop(int d, int min, int extent) {
1286         // An optimization for non-device buffers. For the device case,
1287         // a temp buffer is required, so reuse the not-in-place version.
1288         // TODO(zalman|abadams): Are nop crops common enough to special
1289         // case the device part of the if to do nothing?
1290         if (buf.device_interface != nullptr) {
1291             *this = cropped(d, min, extent);
1292         } else {
1293             crop_host(d, min, extent);
1294         }
1295     }
1296 
1297     /** Make an image that refers to a sub-rectangle of this image along
1298      * the first N dimensions. Asserts that the crop region is within
1299      * the existing bounds. The cropped image may drop any device handle
1300      * if the device_interface cannot accomplish the crop in-place. */
1301     Buffer<T, D> cropped(const std::vector<std::pair<int, int>> &rect) const {
1302         // Make a fresh copy of the underlying buffer (but not a fresh
1303         // copy of the allocation, if there is one).
1304         Buffer<T, D> im = *this;
1305 
        // This guarantees the preexisting device ref is dropped if the
1307         // device_crop call fails and maintains the buffer in a consistent
1308         // state.
1309         im.device_deallocate();
1310 
1311         im.crop_host(rect);
1312         if (buf.device_interface != nullptr) {
1313             complete_device_crop(im);
1314         }
1315         return im;
1316     }
1317 
1318     /** Crop an image in-place along the first N dimensions. This does
1319      * not move any data around in memory, nor does it free memory. It
1320      * just rewrites the min/extent of each dimension to refer to a
1321      * subregion of the same allocation. */
1322     void crop(const std::vector<std::pair<int, int>> &rect) {
1323         // An optimization for non-device buffers. For the device case,
1324         // a temp buffer is required, so reuse the not-in-place version.
1325         // TODO(zalman|abadams): Are nop crops common enough to special
1326         // case the device part of the if to do nothing?
1327         if (buf.device_interface != nullptr) {
1328             *this = cropped(rect);
1329         } else {
1330             crop_host(rect);
1331         }
1332     }
1333 
    /** Make an image which refers to the same data using
1335      * translated coordinates in the given dimension. Positive values
1336      * move the image data to the right or down relative to the
1337      * coordinate system. Drops any device handle. */
1338     Buffer<T, D> translated(int d, int dx) const {
1339         Buffer<T, D> im = *this;
1340         im.translate(d, dx);
1341         return im;
1342     }
1343 
1344     /** Translate an image in-place along one dimension by changing
1345      * how it is indexed. Does not move any data around in memory. */
1346     void translate(int d, int delta) {
1347         assert(d >= 0 && d < this->dimensions());
1348         device_deallocate();
1349         buf.dim[d].min += delta;
1350     }
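
    /* A minimal usage sketch (illustrative; 'im' is hypothetical). After
     * translating, the same memory is addressed at shifted coordinates:
     \code
     Buffer<uint8_t, 2> im(10, 10);   // x and y initially span [0, 10)
     uint8_t *p = &im(0, 0);
     im.translate(0, 5);              // x now spans [5, 15)
     assert(&im(5, 0) == p);          // same element, new coordinate
     \endcode
     */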
1351 
1352     /** Make an image which refers to the same data translated along
1353      * the first N dimensions. */
1354     Buffer<T, D> translated(const std::vector<int> &delta) const {
1355         Buffer<T, D> im = *this;
1356         im.translate(delta);
1357         return im;
1358     }
1359 
1360     /** Translate an image along the first N dimensions by changing
1361      * how it is indexed. Does not move any data around in memory. */
1362     void translate(const std::vector<int> &delta) {
1363         device_deallocate();
1364         assert(delta.size() <= static_cast<decltype(delta.size())>(std::numeric_limits<int>::max()));
1365         int limit = (int)delta.size();
1366         assert(limit <= dimensions());
1367         for (int i = 0; i < limit; i++) {
1368             translate(i, delta[i]);
1369         }
1370     }
1371 
1372     /** Set the min coordinate of an image in the first N dimensions. */
1373     // @{
1374     void set_min(const std::vector<int> &mins) {
1375         assert(mins.size() <= static_cast<decltype(mins.size())>(dimensions()));
1376         device_deallocate();
1377         for (size_t i = 0; i < mins.size(); i++) {
1378             buf.dim[i].min = mins[i];
1379         }
1380     }
1381 
1382     template<typename... Args>
1383     void set_min(Args... args) {
1384         set_min(std::vector<int>{args...});
1385     }
1386     // @}
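
    /* A minimal usage sketch (illustrative; 'im' is hypothetical): center
     * a 100x100 buffer on the origin:
     \code
     Buffer<float, 2> im(100, 100);
     im.set_min(-50, -50);
     assert(im.dim(0).min() == -50 && im.dim(0).max() == 49);
     \endcode
     */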
1387 
1388     /** Test if a given coordinate is within the bounds of an image. */
1389     // @{
1390     bool contains(const std::vector<int> &coords) const {
1391         assert(coords.size() <= static_cast<decltype(coords.size())>(dimensions()));
1392         for (size_t i = 0; i < coords.size(); i++) {
1393             if (coords[i] < dim((int)i).min() || coords[i] > dim((int)i).max()) {
1394                 return false;
1395             }
1396         }
1397         return true;
1398     }
1399 
1400     template<typename... Args>
1401     bool contains(Args... args) const {
1402         return contains(std::vector<int>{args...});
1403     }
1404     // @}
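
    /* A minimal usage sketch (illustrative; 'im' is hypothetical):
     \code
     Buffer<int, 2> im(8, 8);
     assert(im.contains(3, 7));
     assert(!im.contains(8, 0));   // extent is 8, so the max coordinate is 7
     assert(im.contains(std::vector<int>{0, 0}));
     \endcode
     */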
1405 
1406     /** Make a buffer which refers to the same data in the same layout
1407      * using a swapped indexing order for the dimensions given. So
1408      * A = B.transposed(0, 1) means that A(i, j) == B(j, i), and more
1409      * strongly that A.address_of(i, j) == B.address_of(j, i). */
1410     Buffer<T, D> transposed(int d1, int d2) const {
1411         Buffer<T, D> im = *this;
1412         im.transpose(d1, d2);
1413         return im;
1414     }
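
    /* A minimal usage sketch (illustrative; 'a' and 'b' are hypothetical).
     * The transposed view shares storage and simply swaps how the two
     * dimensions are indexed:
     \code
     Buffer<float, 2> a(640, 480);
     Buffer<float, 2> b = a.transposed(0, 1);
     assert(&b(7, 3) == &a(3, 7));
     \endcode
     */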
1415 
1416     /** Transpose a buffer in-place by changing how it is indexed. For
1417      * example, transpose(0, 1) on a two-dimensional buffer means that
1418      * the value referred to by coordinates (i, j) is now reached at
1419      * the coordinates (j, i), and vice versa. This is done by
1420      * reordering the per-dimension metadata rather than by moving
1421      * data around in memory, so other views of the same memory will
1422      * not see the data as having been transposed. */
1423     void transpose(int d1, int d2) {
1424         assert(d1 >= 0 && d1 < this->dimensions());
1425         assert(d2 >= 0 && d2 < this->dimensions());
1426         std::swap(buf.dim[d1], buf.dim[d2]);
1427     }
1428 
1429     /** A generalized transpose: instead of swapping two dimensions,
1430      * pass a vector that lists each dimension index exactly once, in
1431      * the desired order. This does not move any data around in memory
1432      * - it just permutes how it is indexed. */
1433     void transpose(const std::vector<int> &order) {
1434         assert((int)order.size() == dimensions());
1435         if (dimensions() < 2) {
1436             // My, that was easy
1437             return;
1438         }
1439 
1440         std::vector<int> order_sorted = order;
1441         for (size_t i = 1; i < order_sorted.size(); i++) {
1442             for (size_t j = i; j > 0 && order_sorted[j - 1] > order_sorted[j]; j--) {
1443                 std::swap(order_sorted[j], order_sorted[j - 1]);
1444                 transpose(j, j - 1);
1445             }
1446         }
1447     }
1448 
1449     /** Make a buffer which refers to the same data in the same
1450      * layout using a different ordering of the dimensions. */
1451     Buffer<T, D> transposed(const std::vector<int> &order) const {
1452         Buffer<T, D> im = *this;
1453         im.transpose(order);
1454         return im;
1455     }
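
    /* A minimal usage sketch (illustrative; 'chunky' and 'xyc' are
     * hypothetical), matching the equivalence noted in make_interleaved()
     * below: a buffer laid out as (c, x, y) viewed as (x, y, c):
     \code
     Buffer<uint8_t, 3> chunky(3, 640, 480);            // (c, x, y)
     Buffer<uint8_t, 3> xyc = chunky.transposed({2, 0, 1});
     assert(&xyc(10, 20, 1) == &chunky(1, 10, 20));
     \endcode
     */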
1456 
1457     /** Make a lower-dimensional buffer that refers to one slice of
1458      * this buffer. */
1459     Buffer<T, D> sliced(int d, int pos) const {
1460         Buffer<T, D> im = *this;
1461 
        // This guarantees the preexisting device ref is dropped if the
1463         // device_slice call fails and maintains the buffer in a consistent
1464         // state.
1465         im.device_deallocate();
1466 
1467         im.slice_host(d, pos);
1468         if (buf.device_interface != nullptr) {
1469             complete_device_slice(im, d, pos);
1470         }
1471         return im;
1472     }
1473 
1474     /** Make a lower-dimensional buffer that refers to one slice of this
1475      * buffer at the dimension's minimum. */
1476     inline Buffer<T, D> sliced(int d) const {
1477         return sliced(d, dim(d).min());
1478     }
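
    /* A minimal usage sketch (illustrative; 'im' and 'green' are
     * hypothetical): view one channel of a planar image as a 2D buffer:
     \code
     Buffer<float, 3> im(100, 100, 3);
     Buffer<float, 3> green = im.sliced(2, 1);   // now has 2 dimensions
     assert(green.dimensions() == 2);
     assert(&green(10, 20) == &im(10, 20, 1));
     \endcode
     */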
1479 
1480     /** Rewrite the buffer to refer to a single lower-dimensional
1481      * slice of itself along the given dimension at the given
1482      * coordinate. Does not move any data around or free the original
1483      * memory, so other views of the same data are unaffected. */
1484     void slice(int d, int pos) {
1485         // An optimization for non-device buffers. For the device case,
1486         // a temp buffer is required, so reuse the not-in-place version.
1487         // TODO(zalman|abadams): Are nop slices common enough to special
1488         // case the device part of the if to do nothing?
1489         if (buf.device_interface != nullptr) {
1490             *this = sliced(d, pos);
1491         } else {
1492             slice_host(d, pos);
1493         }
1494     }
1495 
1496     /** Slice a buffer in-place at the dimension's minimum. */
1497     inline void slice(int d) {
1498         slice(d, dim(d).min());
1499     }
1500 
1501     /** Make a new buffer that views this buffer as a single slice in a
1502      * higher-dimensional space. The new dimension has extent one and
1503      * the given min. This operation is the opposite of slice. As an
1504      * example, the following condition is true:
1505      *
1506      \code
1507      im2 = im.embedded(1, 17);
1508      &im(x, y, c) == &im2(x, 17, y, c);
1509      \endcode
1510      */
1511     Buffer<T, D> embedded(int d, int pos = 0) const {
1512         Buffer<T, D> im(*this);
1513         im.embed(d, pos);
1514         return im;
1515     }
1516 
1517     /** Embed a buffer in-place, increasing the
1518      * dimensionality. */
1519     void embed(int d, int pos = 0) {
1520         assert(d >= 0 && d <= dimensions());
1521         add_dimension();
1522         translate(dimensions() - 1, pos);
1523         for (int i = dimensions() - 1; i > d; i--) {
1524             transpose(i, i - 1);
1525         }
1526     }
1527 
1528     /** Add a new dimension with a min of zero and an extent of
1529      * one. The stride is the extent of the outermost dimension times
1530      * its stride. The new dimension is the last dimension. This is a
1531      * special case of embed. */
1532     void add_dimension() {
1533         const int dims = buf.dimensions;
1534         buf.dimensions++;
1535         if (buf.dim != shape) {
1536             // We're already on the heap. Reallocate.
1537             halide_dimension_t *new_shape = new halide_dimension_t[buf.dimensions];
1538             for (int i = 0; i < dims; i++) {
1539                 new_shape[i] = buf.dim[i];
1540             }
1541             delete[] buf.dim;
1542             buf.dim = new_shape;
1543         } else if (dims == D) {
1544             // Transition from the in-class storage to the heap
1545             make_shape_storage(buf.dimensions);
1546             for (int i = 0; i < dims; i++) {
1547                 buf.dim[i] = shape[i];
1548             }
1549         } else {
1550             // We still fit in the class
1551         }
1552         buf.dim[dims] = {0, 1, 0};
1553         if (dims == 0) {
1554             buf.dim[dims].stride = 1;
1555         } else {
1556             buf.dim[dims].stride = buf.dim[dims - 1].extent * buf.dim[dims - 1].stride;
1557         }
1558     }
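
    /* A minimal usage sketch (illustrative; 'im' is hypothetical). The new
     * trailing dimension has min 0 and extent 1, so existing sites just
     * gain a trailing zero coordinate:
     \code
     Buffer<int, 4> im(16, 16);
     im.add_dimension();
     assert(im.dimensions() == 3 && im.dim(2).extent() == 1);
     assert(&im(5, 6, 0) == &im(5, 6));   // omitted coords default to the min
     \endcode
     */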
1559 
1560     /** Add a new dimension with a min of zero, an extent of one, and
1561      * the specified stride. The new dimension is the last
1562      * dimension. This is a special case of embed. */
1563     void add_dimension_with_stride(int s) {
1564         add_dimension();
1565         buf.dim[buf.dimensions - 1].stride = s;
1566     }
1567 
1568     /** Methods for managing any GPU allocation. */
1569     // @{
1570     // Set the host dirty flag. Called by every operator()
1571     // access. Must be inlined so it can be hoisted out of loops.
1572     HALIDE_ALWAYS_INLINE
1573     void set_host_dirty(bool v = true) {
1574         assert((!v || !device_dirty()) && "Cannot set host dirty when device is already dirty. Call copy_to_host() before accessing the buffer from host.");
1575         buf.set_host_dirty(v);
1576     }
1577 
1578     // Check if the device allocation is dirty. Called by
1579     // set_host_dirty, which is called by every accessor. Must be
1580     // inlined so it can be hoisted out of loops.
1581     HALIDE_ALWAYS_INLINE
1582     bool device_dirty() const {
1583         return buf.device_dirty();
1584     }
1585 
1586     bool host_dirty() const {
1587         return buf.host_dirty();
1588     }
1589 
1590     void set_device_dirty(bool v = true) {
1591         assert((!v || !host_dirty()) && "Cannot set device dirty when host is already dirty.");
1592         buf.set_device_dirty(v);
1593     }
1594 
1595     int copy_to_host(void *ctx = nullptr) {
1596         if (device_dirty()) {
1597             return buf.device_interface->copy_to_host(ctx, &buf);
1598         }
1599         return 0;
1600     }
1601 
1602     int copy_to_device(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1603         if (host_dirty()) {
1604             return device_interface->copy_to_device(ctx, &buf, device_interface);
1605         }
1606         return 0;
1607     }
1608 
1609     int device_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1610         return device_interface->device_malloc(ctx, &buf, device_interface);
1611     }
1612 
1613     int device_free(void *ctx = nullptr) {
1614         if (dev_ref_count) {
1615             assert(dev_ref_count->ownership == BufferDeviceOwnership::Allocated &&
1616                    "Can't call device_free on an unmanaged or wrapped native device handle. "
1617                    "Free the source allocation or call device_detach_native instead.");
1618             // Multiple people may be holding onto this dev field
1619             assert(dev_ref_count->count == 1 &&
1620                    "Multiple Halide::Runtime::Buffer objects share this device "
1621                    "allocation. Freeing it would create dangling references. "
1622                    "Don't call device_free on Halide buffers that you have copied or "
1623                    "passed by value.");
1624         }
1625         int ret = 0;
1626         if (buf.device_interface) {
1627             ret = buf.device_interface->device_free(ctx, &buf);
1628         }
1629         if (dev_ref_count) {
1630             delete dev_ref_count;
1631             dev_ref_count = nullptr;
1632         }
1633         return ret;
1634     }
1635 
1636     int device_wrap_native(const struct halide_device_interface_t *device_interface,
1637                            uint64_t handle, void *ctx = nullptr) {
1638         assert(device_interface);
1639         dev_ref_count = new DeviceRefCount;
1640         dev_ref_count->ownership = BufferDeviceOwnership::WrappedNative;
1641         return device_interface->wrap_native(ctx, &buf, handle, device_interface);
1642     }
1643 
1644     int device_detach_native(void *ctx = nullptr) {
1645         assert(dev_ref_count &&
1646                dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative &&
1647                "Only call device_detach_native on buffers wrapping a native "
1648                "device handle via device_wrap_native. This buffer was allocated "
1649                "using device_malloc, or is unmanaged. "
1650                "Call device_free or free the original allocation instead.");
1651         // Multiple people may be holding onto this dev field
1652         assert(dev_ref_count->count == 1 &&
1653                "Multiple Halide::Runtime::Buffer objects share this device "
1654                "allocation. Freeing it could create dangling references. "
1655                "Don't call device_detach_native on Halide buffers that you "
1656                "have copied or passed by value.");
1657         int ret = 0;
1658         if (buf.device_interface) {
1659             ret = buf.device_interface->detach_native(ctx, &buf);
1660         }
1661         delete dev_ref_count;
1662         dev_ref_count = nullptr;
1663         return ret;
1664     }
1665 
1666     int device_and_host_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1667         return device_interface->device_and_host_malloc(ctx, &buf, device_interface);
1668     }
1669 
1670     int device_and_host_free(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1671         if (dev_ref_count) {
1672             assert(dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost &&
1673                    "Can't call device_and_host_free on a device handle not allocated with device_and_host_malloc. "
1674                    "Free the source allocation or call device_detach_native instead.");
1675             // Multiple people may be holding onto this dev field
1676             assert(dev_ref_count->count == 1 &&
1677                    "Multiple Halide::Runtime::Buffer objects share this device "
1678                    "allocation. Freeing it would create dangling references. "
1679                    "Don't call device_and_host_free on Halide buffers that you have copied or "
1680                    "passed by value.");
1681         }
1682         int ret = 0;
1683         if (buf.device_interface) {
1684             ret = buf.device_interface->device_and_host_free(ctx, &buf);
1685         }
1686         if (dev_ref_count) {
1687             delete dev_ref_count;
1688             dev_ref_count = nullptr;
1689         }
1690         return ret;
1691     }
1692 
1693     int device_sync(void *ctx = nullptr) {
1694         if (buf.device_interface) {
1695             return buf.device_interface->device_sync(ctx, &buf);
1696         } else {
1697             return 0;
1698         }
1699     }
1700 
1701     bool has_device_allocation() const {
1702         return buf.device != 0;
1703     }
1704 
1705     /** Return the method by which the device field is managed. */
1706     BufferDeviceOwnership device_ownership() const {
1707         if (dev_ref_count == nullptr) {
1708             return BufferDeviceOwnership::Allocated;
1709         }
1710         return dev_ref_count->ownership;
1711     }
1712     // @}
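
    /* A minimal usage sketch of the dirty-flag protocol (illustrative; it
     * assumes a CUDA-enabled runtime, so that halide_cuda_device_interface()
     * from HalideRuntimeCuda.h is available; substitute whichever device
     * interface your target actually provides):
     \code
     const halide_device_interface_t *di = halide_cuda_device_interface();
     Buffer<float, 2> im(640, 480);
     im.fill(0.0f);              // host writes set host_dirty
     im.copy_to_device(di);      // uploads because host_dirty is set
     // ... run a pipeline that writes im on the device, marking device_dirty ...
     im.copy_to_host();          // downloads only if device_dirty is set
     \endcode
     */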
1713 
1714     /** If you use the (x, y, c) indexing convention, then Halide
1715      * Buffers are stored planar by default. This function constructs
1716      * an interleaved RGB or RGBA image that can still be indexed
1717      * using (x, y, c). Passing it to a generator requires that the
1718      * generator has been compiled with support for interleaved (also
1719      * known as packed or chunky) memory layouts. */
1720     static Buffer<void, D> make_interleaved(halide_type_t t, int width, int height, int channels) {
1721         Buffer<void, D> im(t, channels, width, height);
1722         // Note that this is equivalent to calling transpose({2, 0, 1}),
1723         // but slightly more efficient.
1724         im.transpose(0, 1);
1725         im.transpose(1, 2);
1726         return im;
1727     }
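
    /* A minimal usage sketch (illustrative; 'im' is hypothetical). In the
     * interleaved layout the channel dimension has stride 1 and x has a
     * stride equal to the channel count:
     \code
     Buffer<uint8_t, 3> im = Buffer<uint8_t, 3>::make_interleaved(640, 480, 3);
     assert(im.dim(0).stride() == 3);   // x
     assert(im.dim(2).stride() == 1);   // c
     assert(&im(0, 0, 1) == &im(0, 0, 0) + 1);
     \endcode
     */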
1728 
1729     /** If you use the (x, y, c) indexing convention, then Halide
1730      * Buffers are stored planar by default. This function constructs
1731      * an interleaved RGB or RGBA image that can still be indexed
1732      * using (x, y, c). Passing it to a generator requires that the
1733      * generator has been compiled with support for interleaved (also
1734      * known as packed or chunky) memory layouts. */
1735     static Buffer<T, D> make_interleaved(int width, int height, int channels) {
1736         return make_interleaved(static_halide_type(), width, height, channels);
1737     }
1738 
1739     /** Wrap an existing interleaved image. */
1740     static Buffer<add_const_if_T_is_const<void>, D>
1741     make_interleaved(halide_type_t t, T *data, int width, int height, int channels) {
1742         Buffer<add_const_if_T_is_const<void>, D> im(t, data, channels, width, height);
1743         im.transpose(0, 1);
1744         im.transpose(1, 2);
1745         return im;
1746     }
1747 
1748     /** Wrap an existing interleaved image. */
1749     static Buffer<T, D> make_interleaved(T *data, int width, int height, int channels) {
1750         return make_interleaved(static_halide_type(), data, width, height, channels);
1751     }
1752 
1753     /** Make a zero-dimensional Buffer */
1754     static Buffer<add_const_if_T_is_const<void>, D> make_scalar(halide_type_t t) {
1755         Buffer<add_const_if_T_is_const<void>, 1> buf(t, 1);
1756         buf.slice(0, 0);
1757         return buf;
1758     }
1759 
1760     /** Make a zero-dimensional Buffer */
1761     static Buffer<T, D> make_scalar() {
1762         Buffer<T, 1> buf(1);
1763         buf.slice(0, 0);
1764         return buf;
1765     }
1766 
1767     /** Make a zero-dimensional Buffer that points to non-owned, existing data */
1768     static Buffer<T, D> make_scalar(T *data) {
1769         Buffer<T, 1> buf(data, 1);
1770         buf.slice(0, 0);
1771         return buf;
1772     }
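
    /* A minimal usage sketch (illustrative; 's' is hypothetical). A
     * zero-dimensional buffer holds a single value addressed with no
     * coordinates:
     \code
     Buffer<float> s = Buffer<float>::make_scalar();
     s() = 42.0f;
     assert(s.dimensions() == 0);
     \endcode
     */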
1773 
1774     /** Make a buffer with the same shape and memory nesting order as
1775      * another buffer. It may have a different type. */
1776     template<typename T2, int D2>
1777     static Buffer<T, D> make_with_shape_of(Buffer<T2, D2> src,
1778                                            void *(*allocate_fn)(size_t) = nullptr,
1779                                            void (*deallocate_fn)(void *) = nullptr) {
1780 
1781         const halide_type_t dst_type = T_is_void ? src.type() : halide_type_of<typename std::remove_cv<not_void_T>::type>();
1782         return Buffer<>::make_with_shape_of_helper(dst_type, src.dimensions(), src.buf.dim,
1783                                                    allocate_fn, deallocate_fn);
1784     }
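
    /* A minimal usage sketch (illustrative; 'rgb' and 'f' are hypothetical):
     * allocate a float buffer with the same shape and nesting order as an
     * interleaved uint8 image:
     \code
     auto rgb = Buffer<uint8_t, 3>::make_interleaved(640, 480, 3);
     Buffer<float, 3> f = Buffer<float, 3>::make_with_shape_of(rgb);
     assert(f.dim(0).stride() == 3);    // still interleaved
     \endcode
     */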
1785 
1786 private:
1787     static Buffer<> make_with_shape_of_helper(halide_type_t dst_type,
1788                                               int dimensions,
1789                                               halide_dimension_t *shape,
1790                                               void *(*allocate_fn)(size_t),
1791                                               void (*deallocate_fn)(void *)) {
1792         // Reorder the dimensions of src to have strides in increasing order
1793         std::vector<int> swaps;
1794         for (int i = dimensions - 1; i > 0; i--) {
1795             for (int j = i; j > 0; j--) {
1796                 if (shape[j - 1].stride > shape[j].stride) {
1797                     std::swap(shape[j - 1], shape[j]);
1798                     swaps.push_back(j);
1799                 }
1800             }
1801         }
1802 
1803         // Rewrite the strides to be dense (this messes up src, which
1804         // is why we took it by value).
1805         for (int i = 0; i < dimensions; i++) {
1806             if (i == 0) {
1807                 shape[i].stride = 1;
1808             } else {
1809                 shape[i].stride = shape[i - 1].extent * shape[i - 1].stride;
1810             }
1811         }
1812 
1813         // Undo the dimension reordering
1814         while (!swaps.empty()) {
1815             int j = swaps.back();
1816             std::swap(shape[j - 1], shape[j]);
1817             swaps.pop_back();
1818         }
1819 
1820         // Use an explicit runtime type, and make dst a Buffer<void>, to allow
1821         // using this method with Buffer<void> for either src or dst.
1822         Buffer<> dst(dst_type, nullptr, dimensions, shape);
1823         dst.allocate(allocate_fn, deallocate_fn);
1824 
1825         return dst;
1826     }
1827 
1828     template<typename... Args>
1829     HALIDE_ALWAYS_INLINE
1830         ptrdiff_t
1831         offset_of(int d, int first, Args... rest) const {
1832         return offset_of(d + 1, rest...) + this->buf.dim[d].stride * (first - this->buf.dim[d].min);
1833     }
1834 
1835     HALIDE_ALWAYS_INLINE
1836     ptrdiff_t offset_of(int d) const {
1837         return 0;
1838     }
1839 
1840     template<typename... Args>
1841     HALIDE_ALWAYS_INLINE
1842         storage_T *
1843         address_of(Args... args) const {
1844         if (T_is_void) {
1845             return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes();
1846         } else {
1847             return (storage_T *)(this->buf.host) + offset_of(0, args...);
1848         }
1849     }
1850 
1851     HALIDE_ALWAYS_INLINE
1852     ptrdiff_t offset_of(const int *pos) const {
1853         ptrdiff_t offset = 0;
1854         for (int i = this->dimensions() - 1; i >= 0; i--) {
1855             offset += this->buf.dim[i].stride * (pos[i] - this->buf.dim[i].min);
1856         }
1857         return offset;
1858     }
1859 
1860     HALIDE_ALWAYS_INLINE
1861     storage_T *address_of(const int *pos) const {
1862         if (T_is_void) {
1863             return (storage_T *)this->buf.host + offset_of(pos) * type().bytes();
1864         } else {
1865             return (storage_T *)this->buf.host + offset_of(pos);
1866         }
1867     }
1868 
1869 public:
1870     /** Get a pointer to the address of the min coordinate. */
1871     T *data() const {
1872         return (T *)(this->buf.host);
1873     }
1874 
1875     /** Access elements. Use im(...) to get a reference to an element,
1876      * and use &im(...) to get the address of an element. If you pass
1877      * fewer arguments than the buffer has dimensions, the rest are
1878      * treated as their min coordinate. The non-const versions set the
1879      * host_dirty flag to true.
1880      */
    // @{
1882     template<typename... Args,
1883              typename = typename std::enable_if<AllInts<Args...>::value>::type>
1884     HALIDE_ALWAYS_INLINE const not_void_T &operator()(int first, Args... rest) const {
1885         static_assert(!T_is_void,
1886                       "Cannot use operator() on Buffer<void> types");
1887         assert(!device_dirty());
1888         return *((const not_void_T *)(address_of(first, rest...)));
1889     }
1890 
1891     HALIDE_ALWAYS_INLINE
1892     const not_void_T &
1893     operator()() const {
1894         static_assert(!T_is_void,
1895                       "Cannot use operator() on Buffer<void> types");
1896         assert(!device_dirty());
1897         return *((const not_void_T *)(data()));
1898     }
1899 
1900     HALIDE_ALWAYS_INLINE
1901     const not_void_T &
1902     operator()(const int *pos) const {
1903         static_assert(!T_is_void,
1904                       "Cannot use operator() on Buffer<void> types");
1905         assert(!device_dirty());
1906         return *((const not_void_T *)(address_of(pos)));
1907     }
1908 
1909     template<typename... Args,
1910              typename = typename std::enable_if<AllInts<Args...>::value>::type>
1911     HALIDE_ALWAYS_INLINE
1912         not_void_T &
1913         operator()(int first, Args... rest) {
1914         static_assert(!T_is_void,
1915                       "Cannot use operator() on Buffer<void> types");
1916         set_host_dirty();
1917         return *((not_void_T *)(address_of(first, rest...)));
1918     }
1919 
1920     HALIDE_ALWAYS_INLINE
1921     not_void_T &
1922     operator()() {
1923         static_assert(!T_is_void,
1924                       "Cannot use operator() on Buffer<void> types");
1925         set_host_dirty();
1926         return *((not_void_T *)(data()));
1927     }
1928 
1929     HALIDE_ALWAYS_INLINE
1930     not_void_T &
1931     operator()(const int *pos) {
1932         static_assert(!T_is_void,
1933                       "Cannot use operator() on Buffer<void> types");
1934         set_host_dirty();
1935         return *((not_void_T *)(address_of(pos)));
1936     }
1937     // @}
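
    /* A minimal usage sketch (illustrative; 'im' and 'cim' are hypothetical):
     \code
     Buffer<int, 2> im(4, 4);
     im(1, 2) = 7;                     // non-const access sets host_dirty
     const Buffer<int, 2> &cim = im;
     assert(cim(1, 2) == 7);           // const access leaves the flags alone
     assert(&cim(1) == &cim(1, 0));    // omitted coordinates default to the min
     \endcode
     */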
1938 
1939     /** Tests that all values in this buffer are equal to val. */
1940     bool all_equal(not_void_T val) const {
1941         bool all_equal = true;
1942         for_each_element([&](const int *pos) { all_equal &= (*this)(pos) == val; });
1943         return all_equal;
1944     }
1945 
1946     Buffer<T, D> &fill(not_void_T val) {
1947         set_host_dirty();
1948         for_each_value([=](T &v) { v = val; });
1949         return *this;
1950     }
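
    /* A minimal usage sketch (illustrative; 'im' is hypothetical):
     \code
     Buffer<uint8_t, 2> im(32, 32);
     im.fill(255);
     assert(im.all_equal(255));
     \endcode
     */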
1951 
1952 private:
1953     /** Helper functions for for_each_value. */
1954     // @{
1955     template<int N>
1956     struct for_each_value_task_dim {
1957         int extent;
1958         int stride[N];
1959     };
1960 
1961     // Given an array of strides, and a bunch of pointers to pointers
1962     // (all of different types), advance the pointers using the
1963     // strides.
1964     template<typename Ptr, typename... Ptrs>
1965     HALIDE_ALWAYS_INLINE static void advance_ptrs(const int *stride, Ptr *ptr, Ptrs... ptrs) {
1966         (*ptr) += *stride;
1967         advance_ptrs(stride + 1, ptrs...);
1968     }
1969 
1970     HALIDE_ALWAYS_INLINE
1971     static void advance_ptrs(const int *) {
1972     }
1973 
1974     // Same as the above, but just increments the pointers.
1975     template<typename Ptr, typename... Ptrs>
1976     HALIDE_ALWAYS_INLINE static void increment_ptrs(Ptr *ptr, Ptrs... ptrs) {
1977         (*ptr)++;
1978         increment_ptrs(ptrs...);
1979     }
1980 
1981     HALIDE_ALWAYS_INLINE
1982     static void increment_ptrs() {
1983     }
1984 
1985     template<typename Fn, typename... Ptrs>
1986     HALIDE_NEVER_INLINE static void for_each_value_helper(Fn &&f, int d, bool innermost_strides_are_one,
1987                                                           const for_each_value_task_dim<sizeof...(Ptrs)> *t, Ptrs... ptrs) {
1988         if (d == -1) {
1989             f((*ptrs)...);
1990         } else if (d == 0) {
1991             if (innermost_strides_are_one) {
1992                 for (int i = t[0].extent; i != 0; i--) {
1993                     f((*ptrs)...);
1994                     increment_ptrs((&ptrs)...);
1995                 }
1996             } else {
1997                 for (int i = t[0].extent; i != 0; i--) {
1998                     f((*ptrs)...);
1999                     advance_ptrs(t[0].stride, (&ptrs)...);
2000                 }
2001             }
2002         } else {
2003             for (int i = t[d].extent; i != 0; i--) {
2004                 for_each_value_helper(f, d - 1, innermost_strides_are_one, t, ptrs...);
2005                 advance_ptrs(t[d].stride, (&ptrs)...);
2006             }
2007         }
2008     }
2009 
2010     template<int N>
2011     HALIDE_NEVER_INLINE static bool for_each_value_prep(for_each_value_task_dim<N> *t,
2012                                                         const halide_buffer_t **buffers) {
2013         // Check the buffers all have clean host allocations
2014         for (int i = 0; i < N; i++) {
2015             if (buffers[i]->device) {
2016                 assert(buffers[i]->host &&
2017                        "Buffer passed to for_each_value has device allocation but no host allocation. Call allocate() and copy_to_host() first");
2018                 assert(!buffers[i]->device_dirty() &&
2019                        "Buffer passed to for_each_value is dirty on device. Call copy_to_host() first");
2020             } else {
2021                 assert(buffers[i]->host &&
2022                        "Buffer passed to for_each_value has no host or device allocation");
2023             }
2024         }
2025 
2026         const int dimensions = buffers[0]->dimensions;
2027 
2028         // Extract the strides in all the dimensions
2029         for (int i = 0; i < dimensions; i++) {
2030             for (int j = 0; j < N; j++) {
2031                 assert(buffers[j]->dimensions == dimensions);
2032                 assert(buffers[j]->dim[i].extent == buffers[0]->dim[i].extent &&
2033                        buffers[j]->dim[i].min == buffers[0]->dim[i].min);
2034                 const int s = buffers[j]->dim[i].stride;
2035                 t[i].stride[j] = s;
2036             }
2037             t[i].extent = buffers[0]->dim[i].extent;
2038 
2039             // Order the dimensions by stride, so that the traversal is cache-coherent.
2040             for (int j = i; j > 0 && t[j].stride[0] < t[j - 1].stride[0]; j--) {
2041                 std::swap(t[j], t[j - 1]);
2042             }
2043         }
2044 
2045         // flatten dimensions where possible to make a larger inner
2046         // loop for autovectorization.
2047         int d = dimensions;
2048         for (int i = 1; i < d; i++) {
2049             bool flat = true;
2050             for (int j = 0; j < N; j++) {
2051                 flat = flat && t[i - 1].stride[j] * t[i - 1].extent == t[i].stride[j];
2052             }
2053             if (flat) {
2054                 t[i - 1].extent *= t[i].extent;
2055                 for (int j = i; j < d; j++) {
2056                     t[j] = t[j + 1];
2057                 }
2058                 i--;
2059                 d--;
2060                 t[d].extent = 1;
2061             }
2062         }
2063 
2064         bool innermost_strides_are_one = true;
2065         if (dimensions > 0) {
2066             for (int i = 0; i < N; i++) {
2067                 innermost_strides_are_one &= (t[0].stride[i] == 1);
2068             }
2069         }
2070 
2071         return innermost_strides_are_one;
2072     }
2073 
2074     template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2075     void for_each_value_impl(Fn &&f, Args &&... other_buffers) const {
2076         Buffer<>::for_each_value_task_dim<N> *t =
2077             (Buffer<>::for_each_value_task_dim<N> *)HALIDE_ALLOCA((dimensions() + 1) * sizeof(for_each_value_task_dim<N>));
2078         // Move the preparatory code into a non-templated helper to
2079         // save code size.
2080         const halide_buffer_t *buffers[] = {&buf, (&other_buffers.buf)...};
2081         bool innermost_strides_are_one = Buffer<>::for_each_value_prep(t, buffers);
2082 
2083         Buffer<>::for_each_value_helper(f, dimensions() - 1,
2084                                         innermost_strides_are_one,
2085                                         t,
2086                                         data(), (other_buffers.data())...);
2087     }
2088     // @}
2089 
2090 public:
2091     /** Call a function on every value in the buffer, and the
2092      * corresponding values in some number of other buffers of the
2093      * same size. The function should take a reference, const
2094      * reference, or value of the correct type for each buffer. This
2095      * effectively lifts a function of scalars to an element-wise
2096      * function of buffers. This produces code that the compiler can
2097      * autovectorize. This is slightly cheaper than for_each_element,
2098      * because it does not need to track the coordinates.
2099      *
2100      * Note that constness of Buffers is preserved: a const Buffer<T> (for either
2101      * 'this' or the other-buffers arguments) will allow mutation of the
2102      * buffer contents, while a Buffer<const T> will not. Attempting to specify
2103      * a mutable reference for the lambda argument of a Buffer<const T>
2104      * will result in a compilation error. */
2105     // @{
2106     template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2107     HALIDE_ALWAYS_INLINE const Buffer<T, D> &for_each_value(Fn &&f, Args &&... other_buffers) const {
2108         for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2109         return *this;
2110     }
2111 
2112     template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2113     HALIDE_ALWAYS_INLINE
2114         Buffer<T, D> &
2115         for_each_value(Fn &&f, Args &&... other_buffers) {
2116         for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2117         return *this;
2118     }
2119     // @}
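
    /* A minimal usage sketch (illustrative; 'a', 'b', and 'out' are
     * hypothetical): an element-wise 2*a + b over three equally-shaped
     * buffers:
     \code
     Buffer<float, 2> a(64, 64), b(64, 64), out(64, 64);
     a.fill(1.0f);
     b.fill(2.0f);
     out.for_each_value([](float &o, float x, float y) { o = 2.0f * x + y; },
                        a, b);
     assert(out.all_equal(4.0f));
     \endcode
     */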
2120 
2121 private:
2122     // Helper functions for for_each_element
2123     struct for_each_element_task_dim {
2124         int min, max;
2125     };
2126 
2127     /** If f is callable with this many args, call it. The first
2128      * argument is just to make the overloads distinct. Actual
2129      * overload selection is done using the enable_if. */
2130     template<typename Fn,
2131              typename... Args,
2132              typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2133     HALIDE_ALWAYS_INLINE static void for_each_element_variadic(int, int, const for_each_element_task_dim *, Fn &&f, Args... args) {
2134         f(args...);
2135     }
2136 
2137     /** If the above overload is impossible, we add an outer loop over
2138      * an additional argument and try again. */
2139     template<typename Fn,
2140              typename... Args>
2141     HALIDE_ALWAYS_INLINE static void for_each_element_variadic(double, int d, const for_each_element_task_dim *t, Fn &&f, Args... args) {
2142         for (int i = t[d].min; i <= t[d].max; i++) {
2143             for_each_element_variadic(0, d - 1, t, std::forward<Fn>(f), i, args...);
2144         }
2145     }
2146 
2147     /** Determine the minimum number of arguments a callable can take
2148      * using the same trick. */
2149     template<typename Fn,
2150              typename... Args,
2151              typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2152     HALIDE_ALWAYS_INLINE static int num_args(int, Fn &&, Args...) {
2153         return (int)(sizeof...(Args));
2154     }
2155 
2156     /** The recursive version is only enabled up to a recursion limit
2157      * of 256. This catches callables that aren't callable with any
2158      * number of ints. */
2159     template<typename Fn,
2160              typename... Args>
2161     HALIDE_ALWAYS_INLINE static int num_args(double, Fn &&f, Args... args) {
2162         static_assert(sizeof...(args) <= 256,
2163                       "Callable passed to for_each_element must accept either a const int *,"
2164                       " or up to 256 ints. No such operator found. Expect infinite template recursion.");
2165         return num_args(0, std::forward<Fn>(f), 0, args...);
2166     }
2167 
2168     /** A version where the callable takes a position array instead,
2169      * with compile-time recursion on the dimensionality.  This
2170      * overload is preferred to the one below using the same int vs
2171      * double trick as above, but is impossible once d hits -1 using
2172      * std::enable_if. */
2173     template<int d,
2174              typename Fn,
2175              typename = typename std::enable_if<(d >= 0)>::type>
2176     HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(int, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2177         for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2178             for_each_element_array_helper<d - 1>(0, t, std::forward<Fn>(f), pos);
2179         }
2180     }
2181 
2182     /** Base case for recursion above. */
2183     template<int d,
2184              typename Fn,
2185              typename = typename std::enable_if<(d < 0)>::type>
2186     HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(double, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2187         f(pos);
2188     }
2189 
2190     /** A run-time-recursive version (instead of
2191      * compile-time-recursive) that requires the callable to take a
2192      * pointer to a position array instead. Dispatches to the
2193      * compile-time-recursive version once the dimensionality gets
2194      * small. */
2195     template<typename Fn>
2196     static void for_each_element_array(int d, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2197         if (d == -1) {
2198             f(pos);
2199         } else if (d == 0) {
2200             // Once the dimensionality gets small enough, dispatch to
2201             // a compile-time-recursive version for better codegen of
2202             // the inner loops.
2203             for_each_element_array_helper<0, Fn>(0, t, std::forward<Fn>(f), pos);
2204         } else if (d == 1) {
2205             for_each_element_array_helper<1, Fn>(0, t, std::forward<Fn>(f), pos);
2206         } else if (d == 2) {
2207             for_each_element_array_helper<2, Fn>(0, t, std::forward<Fn>(f), pos);
2208         } else if (d == 3) {
2209             for_each_element_array_helper<3, Fn>(0, t, std::forward<Fn>(f), pos);
2210         } else {
2211             for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2212                 for_each_element_array(d - 1, t, std::forward<Fn>(f), pos);
2213             }
2214         }
2215     }
2216 
2217     /** We now have two overloads for for_each_element. This one
2218      * triggers if the callable takes a const int *.
2219      */
2220     template<typename Fn,
2221              typename = decltype(std::declval<Fn>()((const int *)nullptr))>
2222     static void for_each_element(int, int dims, const for_each_element_task_dim *t, Fn &&f, int check = 0) {
2223         int *pos = (int *)HALIDE_ALLOCA(dims * sizeof(int));
2224         for_each_element_array(dims - 1, t, std::forward<Fn>(f), pos);
2225     }
2226 
2227     /** This one triggers otherwise. It treats the callable as
2228      * something that takes some number of ints. */
2229     template<typename Fn>
2230     HALIDE_ALWAYS_INLINE static void for_each_element(double, int dims, const for_each_element_task_dim *t, Fn &&f) {
2231         int args = num_args(0, std::forward<Fn>(f));
2232         assert(dims >= args);
2233         for_each_element_variadic(0, args - 1, t, std::forward<Fn>(f));
2234     }
2235 
2236     template<typename Fn>
2237     void for_each_element_impl(Fn &&f) const {
2238         for_each_element_task_dim *t =
2239             (for_each_element_task_dim *)HALIDE_ALLOCA(dimensions() * sizeof(for_each_element_task_dim));
2240         for (int i = 0; i < dimensions(); i++) {
2241             t[i].min = dim(i).min();
2242             t[i].max = dim(i).max();
2243         }
2244         for_each_element(0, dimensions(), t, std::forward<Fn>(f));
2245     }
2246 
2247 public:
2248     /** Call a function at each site in a buffer. This is likely to be
2249      * much slower than using Halide code to populate a buffer, but is
2250      * convenient for tests. If the function has more arguments than the
2251      * buffer has dimensions, the remaining arguments will be zero. If it
2252      * has fewer arguments than the buffer has dimensions then the last
2253      * few dimensions of the buffer are not iterated over. For example,
2254      * the following code exploits this to set a floating point RGB image
2255      * to red:
2256 
2257      \code
2258      Buffer<float, 3> im(100, 100, 3);
2259      im.for_each_element([&](int x, int y) {
2260          im(x, y, 0) = 1.0f;
2261          im(x, y, 1) = 0.0f;
         im(x, y, 2) = 0.0f;
2263      });
2264      \endcode
2265 
     * The compiled code is equivalent to writing a nested for loop,
2267      * and compilers are capable of optimizing it in the same way.
2268      *
2269      * If the callable can be called with an int * as the sole argument,
2270      * that version is called instead. Each location in the buffer is
2271      * passed to it in a coordinate array. This version is higher-overhead
2272      * than the variadic version, but is useful for writing generic code
2273      * that accepts buffers of arbitrary dimensionality. For example, the
2274      * following sets the value at all sites in an arbitrary-dimensional
2275      * buffer to their first coordinate:
2276 
2277      \code
2278      im.for_each_element([&](const int *pos) {im(pos) = pos[0];});
2279      \endcode
2280 
     * It is also possible to use for_each_element to iterate over entire
     * rows or columns by slicing the buffer down to a single row or column
     * and iterating over the elements of the result. For example, to set
     * the diagonal of the image to 1 by iterating over the columns:
2285 
2286      \code
2287      Buffer<float, 3> im(100, 100, 3);
     im.sliced(1, 0).for_each_element([&](int x, int c) {
         im(x, x, c) = 1.0f;
2290      });
2291      \endcode
2292 
2293      * Or, assuming the memory layout is known to be dense per row, one can
2294      * memset each row of an image like so:
2295 
2296      \code
2297      Buffer<float, 3> im(100, 100, 3);
2298      im.sliced(0, 0).for_each_element([&](int y, int c) {
2299          memset(&im(0, y, c), 0, sizeof(float) * im.width());
2300      });
2301      \endcode
2302 
2303     */
2304     // @{
2305     template<typename Fn>
2306     HALIDE_ALWAYS_INLINE const Buffer<T, D> &for_each_element(Fn &&f) const {
2307         for_each_element_impl(f);
2308         return *this;
2309     }
2310 
2311     template<typename Fn>
2312     HALIDE_ALWAYS_INLINE
2313         Buffer<T, D> &
2314         for_each_element(Fn &&f) {
2315         for_each_element_impl(f);
2316         return *this;
2317     }
2318     // @}
2319 
2320 private:
2321     template<typename Fn>
2322     struct FillHelper {
2323         Fn f;
2324         Buffer<T, D> *buf;
2325 
2326         template<typename... Args,
2327                  typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2328         void operator()(Args... args) {
2329             (*buf)(args...) = f(args...);
2330         }
2331 
2332         FillHelper(Fn &&f, Buffer<T, D> *buf)
2333             : f(std::forward<Fn>(f)), buf(buf) {
2334         }
2335     };
2336 
2337 public:
2338     /** Fill a buffer by evaluating a callable at every site. The
2339      * callable should look much like a callable passed to
2340      * for_each_element, but it should return the value that should be
2341      * stored to the coordinate corresponding to the arguments. */
2342     template<typename Fn,
2343              typename = typename std::enable_if<!std::is_arithmetic<typename std::decay<Fn>::type>::value>::type>
2344     Buffer<T, D> &fill(Fn &&f) {
2345         // We'll go via for_each_element. We need a variadic wrapper lambda.
2346         FillHelper<Fn> wrapper(std::forward<Fn>(f), this);
2347         return for_each_element(wrapper);
2348     }
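
    /* A minimal usage sketch (illustrative; 'im' is hypothetical): fill a
     * buffer with a gradient of its coordinates:
     \code
     Buffer<int, 2> im(16, 16);
     im.fill([](int x, int y) { return x + 16 * y; });
     assert(im(3, 2) == 35);
     \endcode
     */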
2349 
    /** Check if an input buffer passed to an extern stage is a bounds
     * query. Compared to doing the host pointer check directly,
2352      * this both adds clarity to code and will facilitate moving to
2353      * another representation for bounds query arguments. */
2354     bool is_bounds_query() const {
2355         return buf.is_bounds_query();
2356     }
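
    /* A minimal usage sketch (illustrative; 'my_extern_stage' and its
     * bounds logic are hypothetical): the typical shape of an extern stage
     * that answers bounds queries before doing real work:
     \code
     extern "C" int my_extern_stage(halide_buffer_t *in, halide_buffer_t *out) {
         Halide::Runtime::Buffer<float> input(*in);
         if (input.is_bounds_query()) {
             // Report the input region required; here we (hypothetically)
             // need the same region as the output.
             for (int d = 0; d < out->dimensions; d++) {
                 in->dim[d] = out->dim[d];
             }
             return 0;
         }
         // ... compute out from in ...
         return 0;
     }
     \endcode
     */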
2357 
2358     /** Convenient check to verify that all of the interesting bytes in the Buffer
2359      * are initialized under MSAN. Note that by default, we use for_each_value() here so that
2360      * we skip any unused padding that isn't part of the Buffer; this isn't efficient,
2361      * but in MSAN mode, it doesn't matter. (Pass true for the flag to force check
2362      * the entire Buffer storage.) */
2363     void msan_check_mem_is_initialized(bool entire = false) const {
2364 #if defined(__has_feature)
2365 #if __has_feature(memory_sanitizer)
2366         if (entire) {
2367             __msan_check_mem_is_initialized(data(), size_in_bytes());
2368         } else {
            for_each_value([](T &v) { __msan_check_mem_is_initialized(&v, sizeof(T)); });
2370         }
2371 #endif
2372 #endif
2373     }
2374 };
2375 
2376 }  // namespace Runtime
2377 }  // namespace Halide
2378 
2379 #undef HALIDE_ALLOCA
2380 
#endif  // HALIDE_RUNTIME_BUFFER_H
2382