1 /** \file
 * Defines a Buffer type that wraps a halide_buffer_t and adds
 * functionality, along with methods for more conveniently iterating
 * over the samples in a halide_buffer_t outside of Halide code. */
5 
6 #ifndef HALIDE_RUNTIME_BUFFER_H
7 #define HALIDE_RUNTIME_BUFFER_H
8 
9 #include <algorithm>
10 #include <atomic>
11 #include <cassert>
12 #include <limits>
13 #include <memory>
14 #include <stdint.h>
15 #include <string.h>
16 #include <vector>
17 
18 #if defined(__has_feature)
19 #if __has_feature(memory_sanitizer)
20 #include <sanitizer/msan_interface.h>
21 #endif
22 #endif
23 
24 #include "HalideRuntime.h"
25 
26 #ifdef _MSC_VER
27 #include <malloc.h>
28 #define HALIDE_ALLOCA _alloca
29 #else
30 #define HALIDE_ALLOCA __builtin_alloca
31 #endif
32 
33 // gcc 5.1 has a false positive warning on this code
34 #if __GNUC__ == 5 && __GNUC_MINOR__ == 1
35 #pragma GCC diagnostic ignored "-Warray-bounds"
36 #endif
37 
38 namespace Halide {
39 namespace Runtime {
40 
41 // Forward-declare our Buffer class
42 template<typename T, int D>
43 class Buffer;
44 
// A helper to check if a parameter pack is entirely implicitly
// convertible to int, for use with std::enable_if
47 template<typename... Args>
48 struct AllInts : std::false_type {};
49 
50 template<>
51 struct AllInts<> : std::true_type {};
52 
53 template<typename T, typename... Args>
54 struct AllInts<T, Args...> {
55     static const bool value = std::is_convertible<T, int>::value && AllInts<Args...>::value;
56 };
57 
58 // Floats and doubles are technically implicitly int-convertible, but
59 // doing so produces a warning we treat as an error, so just disallow
60 // it here.
61 template<typename... Args>
62 struct AllInts<float, Args...> : std::false_type {};
63 
64 template<typename... Args>
65 struct AllInts<double, Args...> : std::false_type {};
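
// For illustration: the trait accepts packs of integral types and rejects
// packs containing a float or double. For example:
static_assert(AllInts<int, char, unsigned>::value, "integral packs are accepted");
static_assert(!AllInts<int, float>::value, "packs containing float are rejected");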
66 
67 // A helper to detect if there are any zeros in a container
68 namespace Internal {
69 template<typename Container>
70 bool any_zero(const Container &c) {
71     for (int i : c) {
72         if (i == 0) return true;
73     }
74     return false;
75 }
76 }  // namespace Internal
77 
78 /** A struct acting as a header for allocations owned by the Buffer
79  * class itself. */
80 struct AllocationHeader {
81     void (*deallocate_fn)(void *);
82     std::atomic<int> ref_count;
83 
84     // Note that ref_count always starts at 1
85     AllocationHeader(void (*deallocate_fn)(void *))
86         : deallocate_fn(deallocate_fn), ref_count(1) {
87     }
88 };
89 
/** This indicates how to deallocate the device allocation for a Halide::Runtime::Buffer. */
enum struct BufferDeviceOwnership : int {
    Allocated,               ///< halide_device_free will be called when device ref count goes to zero
    WrappedNative,           ///< halide_device_detach_native will be called when device ref count goes to zero
    Unmanaged,               ///< No free routine will be called when device ref count goes to zero
    AllocatedDeviceAndHost,  ///< halide_device_and_host_free will be called when device ref count goes to zero
    Cropped,                 ///< halide_device_release_crop will be called when device ref count goes to zero
97 };
98 
/** A reference-counting struct for managing device allocations,
 * analogous to AllocationHeader for host allocations. */
100 struct DeviceRefCount {
101     // This is only ever constructed when there's something to manage,
102     // so start at one.
103     std::atomic<int> count{1};
104     BufferDeviceOwnership ownership{BufferDeviceOwnership::Allocated};
105 };
106 
107 /** A templated Buffer class that wraps halide_buffer_t and adds
108  * functionality. When using Halide from C++, this is the preferred
109  * way to create input and output buffers. The overhead of using this
110  * class relative to a naked halide_buffer_t is minimal - it uses another
111  * ~16 bytes on the stack, and does no dynamic allocations when using
112  * it to represent existing memory of a known maximum dimensionality.
113  *
114  * The template parameter T is the element type. For buffers where the
115  * element type is unknown, or may vary, use void or const void.
116  *
117  * D is the maximum number of dimensions that can be represented using
118  * space inside the class itself. Set it to the maximum dimensionality
119  * you expect this buffer to be. If the actual dimensionality exceeds
120  * this, heap storage is allocated to track the shape of the buffer. D
121  * defaults to 4, which should cover nearly all usage.
122  *
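 * For illustration, a buffer of float image data could be declared as
 * Buffer<float>, while a buffer whose element type is only known at
 * runtime could be declared as Buffer<void> and given an explicit
 * halide_type_t at construction:
 *
 * \code
 * Halide::Runtime::Buffer<float> typed(640, 480, 3);
 * Halide::Runtime::Buffer<void> untyped(halide_type_of<float>(), 640, 480, 3);
 * \endcode
 *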
 * The class optionally allocates and owns memory for the image using
 * a shared pointer allocated with the provided allocator. If the
 * allocator is null, malloc and free are used. Any device-side
 * allocation is considered owned if and only if the host-side
 * allocation is owned. */
128 template<typename T = void, int D = 4>
129 class Buffer {
130     /** The underlying halide_buffer_t */
131     halide_buffer_t buf = {0};
132 
133     /** Some in-class storage for shape of the dimensions. */
134     halide_dimension_t shape[D];
135 
136     /** The allocation owned by this Buffer. NULL if the Buffer does not
137      * own the memory. */
138     AllocationHeader *alloc = nullptr;
139 
140     /** A reference count for the device allocation owned by this
141      * buffer. */
142     mutable DeviceRefCount *dev_ref_count = nullptr;
143 
144     /** True if T is of type void or const void */
145     static const bool T_is_void = std::is_same<typename std::remove_const<T>::type, void>::value;
146 
147     /** A type function that adds a const qualifier if T is a const type. */
148     template<typename T2>
149     using add_const_if_T_is_const = typename std::conditional<std::is_const<T>::value, const T2, T2>::type;
150 
151     /** T unless T is (const) void, in which case (const)
152      * uint8_t. Useful for providing return types for operator() */
153     using not_void_T = typename std::conditional<T_is_void,
154                                                  add_const_if_T_is_const<uint8_t>,
155                                                  T>::type;
156 
157     /** T with constness removed. Useful for return type of copy(). */
158     using not_const_T = typename std::remove_const<T>::type;
159 
160     /** The type the elements are stored as. Equal to not_void_T
161      * unless T is a pointer, in which case uint64_t. Halide stores
162      * all pointer types as uint64s internally, even on 32-bit
163      * systems. */
164     using storage_T = typename std::conditional<std::is_pointer<T>::value, uint64_t, not_void_T>::type;
165 
166 public:
167     /** True if the Halide type is not void (or const void). */
168     static constexpr bool has_static_halide_type = !T_is_void;
169 
170     /** Get the Halide type of T. Callers should not use the result if
171      * has_static_halide_type is false. */
172     static halide_type_t static_halide_type() {
173         return halide_type_of<typename std::remove_cv<not_void_T>::type>();
174     }
175 
176     /** Does this Buffer own the host memory it refers to? */
177     bool owns_host_memory() const {
178         return alloc != nullptr;
179     }
180 
181 private:
182     /** Increment the reference count of any owned allocation */
183     void incref() const {
184         if (owns_host_memory()) {
185             alloc->ref_count++;
186         }
187         if (buf.device) {
188             if (!dev_ref_count) {
189                 // I seem to have a non-zero dev field but no
190                 // reference count for it. I must have been given a
191                 // device allocation by a Halide pipeline, and have
192                 // never been copied from since. Take sole ownership
193                 // of it.
194                 dev_ref_count = new DeviceRefCount;
195             }
196             dev_ref_count->count++;
197         }
198     }
199 
    // Note that this is called "cropped" but can also encompass
    // slice/embed operations.
202     struct DevRefCountCropped : DeviceRefCount {
203         Buffer<T, D> cropped_from;
204         DevRefCountCropped(const Buffer<T, D> &cropped_from)
205             : cropped_from(cropped_from) {
206             ownership = BufferDeviceOwnership::Cropped;
207         }
208     };
209 
210     /** Setup the device ref count for a buffer to indicate it is a crop (or slice, embed, etc) of cropped_from */
211     void crop_from(const Buffer<T, D> &cropped_from) {
212         assert(dev_ref_count == nullptr);
213         dev_ref_count = new DevRefCountCropped(cropped_from);
214     }
215 
216     /** Decrement the reference count of any owned allocation and free host
217      * and device memory if it hits zero. Sets alloc to nullptr. */
218     void decref() {
219         if (owns_host_memory()) {
220             int new_count = --(alloc->ref_count);
221             if (new_count == 0) {
222                 void (*fn)(void *) = alloc->deallocate_fn;
223                 alloc->~AllocationHeader();
224                 fn(alloc);
225             }
226             buf.host = nullptr;
227             alloc = nullptr;
228             set_host_dirty(false);
229         }
230         decref_dev();
231     }
232 
233     void decref_dev() {
234         int new_count = 0;
235         if (dev_ref_count) {
236             new_count = --(dev_ref_count->count);
237         }
238         if (new_count == 0) {
239             if (buf.device) {
240                 assert(!(alloc && device_dirty()) &&
241                        "Implicitly freeing a dirty device allocation while a host allocation still lives. "
242                        "Call device_free explicitly if you want to drop dirty device-side data. "
243                        "Call copy_to_host explicitly if you want the data copied to the host allocation "
244                        "before the device allocation is freed.");
245                 if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative) {
246                     buf.device_interface->detach_native(nullptr, &buf);
247                 } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost) {
248                     buf.device_interface->device_and_host_free(nullptr, &buf);
249                 } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
250                     buf.device_interface->device_release_crop(nullptr, &buf);
251                 } else if (dev_ref_count == nullptr || dev_ref_count->ownership == BufferDeviceOwnership::Allocated) {
252                     buf.device_interface->device_free(nullptr, &buf);
253                 }
254             }
255             if (dev_ref_count) {
256                 if (dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
257                     delete (DevRefCountCropped *)dev_ref_count;
258                 } else {
259                     delete dev_ref_count;
260                 }
261             }
262         }
263         buf.device = 0;
264         buf.device_interface = nullptr;
265         dev_ref_count = nullptr;
266     }
267 
268     void free_shape_storage() {
269         if (buf.dim != shape) {
270             delete[] buf.dim;
271             buf.dim = nullptr;
272         }
273     }
274 
275     void make_shape_storage(const int dimensions) {
276         // This should usually be inlined, so if dimensions is statically known,
277         // we can skip the call to new
278         buf.dimensions = dimensions;
279         buf.dim = (dimensions <= D) ? shape : new halide_dimension_t[dimensions];
280     }
281 
282     void copy_shape_from(const halide_buffer_t &other) {
283         // All callers of this ensure that buf.dimensions == other.dimensions.
284         make_shape_storage(other.dimensions);
285         std::copy(other.dim, other.dim + other.dimensions, buf.dim);
286     }
287 
288     template<typename T2, int D2>
289     void move_shape_from(Buffer<T2, D2> &&other) {
290         if (other.shape == other.buf.dim) {
291             copy_shape_from(other.buf);
292         } else {
293             buf.dim = other.buf.dim;
294             other.buf.dim = nullptr;
295         }
296     }
297 
    /** Initialize this Buffer's fields and shape from a halide_buffer_t. */
299     void initialize_from_buffer(const halide_buffer_t &b,
300                                 BufferDeviceOwnership ownership) {
301         memcpy(&buf, &b, sizeof(halide_buffer_t));
302         copy_shape_from(b);
303         if (b.device) {
304             dev_ref_count = new DeviceRefCount;
305             dev_ref_count->ownership = ownership;
306         }
307     }
308 
309     /** Initialize the shape from an array of ints */
310     void initialize_shape(const int *sizes) {
311         for (int i = 0; i < buf.dimensions; i++) {
312             buf.dim[i].min = 0;
313             buf.dim[i].extent = sizes[i];
314             if (i == 0) {
315                 buf.dim[i].stride = 1;
316             } else {
317                 buf.dim[i].stride = buf.dim[i - 1].stride * buf.dim[i - 1].extent;
318             }
319         }
320     }
321 
322     /** Initialize the shape from a vector of extents */
323     void initialize_shape(const std::vector<int> &sizes) {
324         assert(buf.dimensions == (int)sizes.size());
325         initialize_shape(sizes.data());
326     }
327 
328     /** Initialize the shape from the static shape of an array */
329     template<typename Array, size_t N>
330     void initialize_shape_from_array_shape(int next, Array (&vals)[N]) {
331         buf.dim[next].min = 0;
332         buf.dim[next].extent = (int)N;
333         if (next == 0) {
334             buf.dim[next].stride = 1;
335         } else {
336             initialize_shape_from_array_shape(next - 1, vals[0]);
337             buf.dim[next].stride = buf.dim[next - 1].stride * buf.dim[next - 1].extent;
338         }
339     }
340 
341     /** Base case for the template recursion above. */
342     template<typename T2>
343     void initialize_shape_from_array_shape(int, const T2 &) {
344     }
345 
346     /** Get the dimensionality of a multi-dimensional C array */
347     template<typename Array, size_t N>
348     static int dimensionality_of_array(Array (&vals)[N]) {
349         return dimensionality_of_array(vals[0]) + 1;
350     }
351 
352     template<typename T2>
353     static int dimensionality_of_array(const T2 &) {
354         return 0;
355     }
356 
357     /** Get the underlying halide_type_t of an array's element type. */
358     template<typename Array, size_t N>
359     static halide_type_t scalar_type_of_array(Array (&vals)[N]) {
360         return scalar_type_of_array(vals[0]);
361     }
362 
363     template<typename T2>
364     static halide_type_t scalar_type_of_array(const T2 &) {
365         return halide_type_of<typename std::remove_cv<T2>::type>();
366     }
367 
368     /** Crop a single dimension without handling device allocation. */
369     void crop_host(int d, int min, int extent) {
370         assert(dim(d).min() <= min);
371         assert(dim(d).max() >= min + extent - 1);
372         int shift = min - dim(d).min();
373         if (buf.host != nullptr) {
374             buf.host += shift * dim(d).stride() * type().bytes();
375         }
376         buf.dim[d].min = min;
377         buf.dim[d].extent = extent;
378     }
379 
380     /** Crop as many dimensions as are in rect, without handling device allocation. */
381     void crop_host(const std::vector<std::pair<int, int>> &rect) {
382         assert(rect.size() <= static_cast<decltype(rect.size())>(std::numeric_limits<int>::max()));
383         int limit = (int)rect.size();
384         assert(limit <= dimensions());
385         for (int i = 0; i < limit; i++) {
386             crop_host(i, rect[i].first, rect[i].second);
387         }
388     }
389 
390     void complete_device_crop(Buffer<T, D> &result_host_cropped) const {
391         assert(buf.device_interface != nullptr);
392         if (buf.device_interface->device_crop(nullptr, &this->buf, &result_host_cropped.buf) == 0) {
393             const Buffer<T, D> *cropped_from = this;
            // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
            // Is it possible to get to this point without incref having run at least once since
            // the device field was set? (I.e. might incref already have been called in the
            // internal logic of crop?)
398             if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
399                 cropped_from = &((DevRefCountCropped *)dev_ref_count)->cropped_from;
400             }
401             result_host_cropped.crop_from(*cropped_from);
402         }
403     }
404 
    /** Slice a single dimension without handling device allocation. */
406     void slice_host(int d, int pos) {
407         assert(d >= 0 && d < dimensions());
408         assert(pos >= dim(d).min() && pos <= dim(d).max());
409         buf.dimensions--;
410         int shift = pos - buf.dim[d].min;
411         if (buf.host != nullptr) {
412             buf.host += shift * buf.dim[d].stride * type().bytes();
413         }
414         for (int i = d; i < buf.dimensions; i++) {
415             buf.dim[i] = buf.dim[i + 1];
416         }
417         buf.dim[buf.dimensions] = {0, 0, 0};
418     }
419 
420     void complete_device_slice(Buffer<T, D> &result_host_sliced, int d, int pos) const {
421         assert(buf.device_interface != nullptr);
422         if (buf.device_interface->device_slice(nullptr, &this->buf, d, pos, &result_host_sliced.buf) == 0) {
423             const Buffer<T, D> *sliced_from = this;
            // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
            // Is it possible to get to this point without incref having run at least once since
            // the device field was set? (I.e. might incref already have been called in the
            // internal logic of slice?)
428             if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
429                 sliced_from = &((DevRefCountCropped *)dev_ref_count)->cropped_from;
430             }
431             // crop_from() is correct here, despite the fact that we are slicing.
432             result_host_sliced.crop_from(*sliced_from);
433         }
434     }
435 
436 public:
437     typedef T ElemType;
438 
439     /** Read-only access to the shape */
440     class Dimension {
441         const halide_dimension_t &d;
442 
443     public:
444         /** The lowest coordinate in this dimension */
445         HALIDE_ALWAYS_INLINE int min() const {
446             return d.min;
447         }
448 
449         /** The number of elements in memory you have to step over to
450          * increment this coordinate by one. */
451         HALIDE_ALWAYS_INLINE int stride() const {
452             return d.stride;
453         }
454 
455         /** The extent of the image along this dimension */
456         HALIDE_ALWAYS_INLINE int extent() const {
457             return d.extent;
458         }
459 
460         /** The highest coordinate in this dimension */
461         HALIDE_ALWAYS_INLINE int max() const {
462             return min() + extent() - 1;
463         }
464 
        /** An iterator class, so that you can iterate over
         * coordinates in a dimension using a range-based for loop. */
467         struct iterator {
468             int val;
469             int operator*() const {
470                 return val;
471             }
472             bool operator!=(const iterator &other) const {
473                 return val != other.val;
474             }
475             iterator &operator++() {
476                 val++;
477                 return *this;
478             }
479         };
480 
481         /** An iterator that points to the min coordinate */
482         HALIDE_ALWAYS_INLINE iterator begin() const {
483             return {min()};
484         }
485 
486         /** An iterator that points to one past the max coordinate */
487         HALIDE_ALWAYS_INLINE iterator end() const {
488             return {min() + extent()};
489         }
490 
        Dimension(const halide_dimension_t &dim)
            : d(dim) {
        }
493     };
494 
495     /** Access the shape of the buffer */
496     HALIDE_ALWAYS_INLINE Dimension dim(int i) const {
497         assert(i >= 0 && i < this->dimensions());
498         return Dimension(buf.dim[i]);
499     }
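
    // For illustration, dim() together with Dimension's begin()/end() supports
    // range-based iteration over a buffer's coordinates. A minimal sketch,
    // assuming a two-dimensional Buffer<float> named im:
    //
    //     for (int y : im.dim(1)) {
    //         for (int x : im.dim(0)) {
    //             im(x, y) = 0.0f;
    //         }
    //     }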
500 
501     /** Access to the mins, strides, extents. Will be deprecated. Do not use. */
502     // @{
503     int min(int i) const {
504         return dim(i).min();
505     }
506     int extent(int i) const {
507         return dim(i).extent();
508     }
509     int stride(int i) const {
510         return dim(i).stride();
511     }
512     // @}
513 
514     /** The total number of elements this buffer represents. Equal to
515      * the product of the extents */
516     size_t number_of_elements() const {
517         size_t s = 1;
518         for (int i = 0; i < dimensions(); i++) {
519             s *= dim(i).extent();
520         }
521         return s;
522     }
523 
524     /** Get the dimensionality of the buffer. */
525     int dimensions() const {
526         return buf.dimensions;
527     }
528 
529     /** Get the type of the elements. */
530     halide_type_t type() const {
531         return buf.type;
532     }
533 
534 private:
535     /** Offset to the element with the lowest address. If all
536      * strides are positive, equal to zero. Offset is in elements, not bytes. */
537     ptrdiff_t begin_offset() const {
538         ptrdiff_t index = 0;
539         for (int i = 0; i < dimensions(); i++) {
540             if (dim(i).stride() < 0) {
541                 index += dim(i).stride() * (dim(i).extent() - 1);
542             }
543         }
544         return index;
545     }
546 
547     /** An offset to one beyond the element with the highest address.
548      * Offset is in elements, not bytes. */
549     ptrdiff_t end_offset() const {
550         ptrdiff_t index = 0;
551         for (int i = 0; i < dimensions(); i++) {
552             if (dim(i).stride() > 0) {
553                 index += dim(i).stride() * (dim(i).extent() - 1);
554             }
555         }
556         index += 1;
557         return index;
558     }
559 
560 public:
561     /** A pointer to the element with the lowest address. If all
562      * strides are positive, equal to the host pointer. */
563     T *begin() const {
564         assert(buf.host != nullptr);  // Cannot call begin() on an unallocated Buffer.
565         return (T *)(buf.host + begin_offset() * type().bytes());
566     }
567 
568     /** A pointer to one beyond the element with the highest address. */
569     T *end() const {
570         assert(buf.host != nullptr);  // Cannot call end() on an unallocated Buffer.
571         return (T *)(buf.host + end_offset() * type().bytes());
572     }
573 
574     /** The total number of bytes spanned by the data in memory. */
575     size_t size_in_bytes() const {
576         return (size_t)(end_offset() - begin_offset()) * type().bytes();
577     }
578 
579     /** Reset the Buffer to be equivalent to a default-constructed Buffer
580      * of the same static type (if any); Buffer<void> will have its runtime
581      * type reset to uint8. */
582     void reset() {
583         *this = Buffer();
584     }
585 
586     Buffer()
587         : shape() {
588         buf.type = static_halide_type();
589         make_shape_storage(0);
590     }
591 
592     /** Make a Buffer from a halide_buffer_t */
593     explicit Buffer(const halide_buffer_t &buf,
594                     BufferDeviceOwnership ownership = BufferDeviceOwnership::Unmanaged) {
595         assert(T_is_void || buf.type == static_halide_type());
596         initialize_from_buffer(buf, ownership);
597     }
598 
599     /** Give Buffers access to the members of Buffers of different dimensionalities and types. */
600     template<typename T2, int D2>
601     friend class Buffer;
602 
603 private:
604     template<typename T2, int D2>
605     static void static_assert_can_convert_from() {
606         static_assert((!std::is_const<T2>::value || std::is_const<T>::value),
607                       "Can't convert from a Buffer<const T> to a Buffer<T>");
608         static_assert(std::is_same<typename std::remove_const<T>::type,
609                                    typename std::remove_const<T2>::type>::value ||
610                           T_is_void || Buffer<T2, D2>::T_is_void,
611                       "type mismatch constructing Buffer");
612     }
613 
614 public:
    /** Determine if a Buffer<T, D> can be constructed from some other Buffer type.
     * If the conversion is impossible at compile time, fail with a static assert;
     * otherwise return a boolean based on runtime typing. */
618     template<typename T2, int D2>
619     static bool can_convert_from(const Buffer<T2, D2> &other) {
620         static_assert_can_convert_from<T2, D2>();
621         if (Buffer<T2, D2>::T_is_void && !T_is_void) {
622             return other.type() == static_halide_type();
623         }
624         return true;
625     }
626 
    /** Fail an assertion at runtime or compile-time if a Buffer<T, D>
     * cannot be constructed from some other Buffer type. */
629     template<typename T2, int D2>
630     static void assert_can_convert_from(const Buffer<T2, D2> &other) {
631         // Explicitly call static_assert_can_convert_from() here so
632         // that we always get compile-time checking, even if compiling with
633         // assertions disabled.
634         static_assert_can_convert_from<T2, D2>();
635         assert(can_convert_from(other));
636     }
637 
638     /** Copy constructor. Does not copy underlying data. */
639     Buffer(const Buffer<T, D> &other)
640         : buf(other.buf),
641           alloc(other.alloc) {
642         other.incref();
643         dev_ref_count = other.dev_ref_count;
644         copy_shape_from(other.buf);
645     }
646 
    /** Construct a Buffer from a Buffer of different dimensionality
     * and type. Asserts that the type matches (at runtime, if one of
     * the types is void). Note that this constructor is
     * implicit. This, for example, lets you pass things like
     * Buffer<T> or Buffer<const void> to functions expecting
     * Buffer<const T>. */
653     template<typename T2, int D2>
654     Buffer(const Buffer<T2, D2> &other)
655         : buf(other.buf),
656           alloc(other.alloc) {
657         assert_can_convert_from(other);
658         other.incref();
659         dev_ref_count = other.dev_ref_count;
660         copy_shape_from(other.buf);
661     }
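
    // For illustration, the implicit conversion above allows calls like the
    // following, where process() is a hypothetical function:
    //
    //     void process(const Halide::Runtime::Buffer<const float> &input);
    //
    //     Halide::Runtime::Buffer<float> im(640, 480);
    //     process(im);  // Buffer<float> converts to Buffer<const float>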
662 
663     /** Move constructor */
664     Buffer(Buffer<T, D> &&other) noexcept
665         : buf(other.buf),
666           alloc(other.alloc),
667           dev_ref_count(other.dev_ref_count) {
668         other.dev_ref_count = nullptr;
669         other.alloc = nullptr;
670         move_shape_from(std::forward<Buffer<T, D>>(other));
671         other.buf = halide_buffer_t();
672     }
673 
674     /** Move-construct a Buffer from a Buffer of different
675      * dimensionality and type. Asserts that the types match (at
676      * runtime if one of the types is void). */
677     template<typename T2, int D2>
678     Buffer(Buffer<T2, D2> &&other)
679         : buf(other.buf),
680           alloc(other.alloc),
681           dev_ref_count(other.dev_ref_count) {
682         assert_can_convert_from(other);
683         other.dev_ref_count = nullptr;
684         other.alloc = nullptr;
685         move_shape_from(std::forward<Buffer<T2, D2>>(other));
686         other.buf = halide_buffer_t();
687     }
688 
689     /** Assign from another Buffer of possibly-different
690      * dimensionality and type. Asserts that the types match (at
691      * runtime if one of the types is void). */
692     template<typename T2, int D2>
693     Buffer<T, D> &operator=(const Buffer<T2, D2> &other) {
694         if ((const void *)this == (const void *)&other) {
695             return *this;
696         }
697         assert_can_convert_from(other);
698         other.incref();
699         decref();
700         dev_ref_count = other.dev_ref_count;
701         alloc = other.alloc;
702         free_shape_storage();
703         buf = other.buf;
704         copy_shape_from(other.buf);
705         return *this;
706     }
707 
708     /** Standard assignment operator */
709     Buffer<T, D> &operator=(const Buffer<T, D> &other) {
710         if (this == &other) {
711             return *this;
712         }
713         other.incref();
714         decref();
715         dev_ref_count = other.dev_ref_count;
716         alloc = other.alloc;
717         free_shape_storage();
718         buf = other.buf;
719         copy_shape_from(other.buf);
720         return *this;
721     }
722 
723     /** Move from another Buffer of possibly-different
724      * dimensionality and type. Asserts that the types match (at
725      * runtime if one of the types is void). */
726     template<typename T2, int D2>
727     Buffer<T, D> &operator=(Buffer<T2, D2> &&other) {
728         assert_can_convert_from(other);
729         decref();
730         alloc = other.alloc;
731         other.alloc = nullptr;
732         dev_ref_count = other.dev_ref_count;
733         other.dev_ref_count = nullptr;
734         free_shape_storage();
735         buf = other.buf;
736         move_shape_from(std::forward<Buffer<T2, D2>>(other));
737         other.buf = halide_buffer_t();
738         return *this;
739     }
740 
741     /** Standard move-assignment operator */
742     Buffer<T, D> &operator=(Buffer<T, D> &&other) noexcept {
743         decref();
744         alloc = other.alloc;
745         other.alloc = nullptr;
746         dev_ref_count = other.dev_ref_count;
747         other.dev_ref_count = nullptr;
748         free_shape_storage();
749         buf = other.buf;
750         move_shape_from(std::forward<Buffer<T, D>>(other));
751         other.buf = halide_buffer_t();
752         return *this;
753     }
754 
    /** Check that the total size of the buffer (the product of the extents
     * and the element size) does not overflow. */
756     void check_overflow() {
757         size_t size = type().bytes();
758         for (int i = 0; i < dimensions(); i++) {
759             size *= dim(i).extent();
760         }
761         // We allow 2^31 or 2^63 bytes, so drop the top bit.
762         size = (size << 1) >> 1;
763         for (int i = 0; i < dimensions(); i++) {
764             size /= dim(i).extent();
765         }
766         assert(size == (size_t)type().bytes() && "Error: Overflow computing total size of buffer.");
767     }
768 
769     /** Allocate memory for this Buffer. Drops the reference to any
770      * owned memory. */
771     void allocate(void *(*allocate_fn)(size_t) = nullptr,
772                   void (*deallocate_fn)(void *) = nullptr) {
773         if (!allocate_fn) {
774             allocate_fn = malloc;
775         }
776         if (!deallocate_fn) {
777             deallocate_fn = free;
778         }
779 
780         // Drop any existing allocation
781         deallocate();
782 
783         // Conservatively align images to 128 bytes. This is enough
784         // alignment for all the platforms we might use.
785         size_t size = size_in_bytes();
786         const size_t alignment = 128;
787         size = (size + alignment - 1) & ~(alignment - 1);
788         void *alloc_storage = allocate_fn(size + sizeof(AllocationHeader) + alignment - 1);
789         alloc = new (alloc_storage) AllocationHeader(deallocate_fn);
790         uint8_t *unaligned_ptr = ((uint8_t *)alloc) + sizeof(AllocationHeader);
791         buf.host = (uint8_t *)((uintptr_t)(unaligned_ptr + alignment - 1) & ~(alignment - 1));
792     }
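
    // For illustration, a minimal sketch of the deferred-allocation pattern
    // used elsewhere in this file: wrap a null host pointer with the desired
    // shape, then call allocate(). The allocator names here are hypothetical:
    //
    //     Halide::Runtime::Buffer<float> im(nullptr, 1024, 1024);
    //     im.allocate(my_aligned_malloc, my_aligned_free);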
793 
794     /** Drop reference to any owned host or device memory, possibly
795      * freeing it, if this buffer held the last reference to
796      * it. Retains the shape of the buffer. Does nothing if this
797      * buffer did not allocate its own memory. */
798     void deallocate() {
799         decref();
800     }
801 
802     /** Drop reference to any owned device memory, possibly freeing it
803      * if this buffer held the last reference to it. Asserts that
804      * device_dirty is false. */
805     void device_deallocate() {
806         decref_dev();
807     }
808 
809     /** Allocate a new image of the given size with a runtime
810      * type. Only used when you do know what size you want but you
811      * don't know statically what type the elements are. Pass zeroes
812      * to make a buffer suitable for bounds query calls. */
813     template<typename... Args,
814              typename = typename std::enable_if<AllInts<Args...>::value>::type>
815     Buffer(halide_type_t t, int first, Args... rest) {
816         if (!T_is_void) {
817             assert(static_halide_type() == t);
818         }
819         int extents[] = {first, (int)rest...};
820         buf.type = t;
821         constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
822         make_shape_storage(buf_dimensions);
823         initialize_shape(extents);
824         if (!Internal::any_zero(extents)) {
825             check_overflow();
826             allocate();
827         }
828     }
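
    // For illustration, a minimal bounds-query sketch: constructing with zero
    // extents leaves the buffer unallocated, so an AOT-compiled Halide pipeline
    // (my_pipeline is hypothetical) will fill in the shape it requires instead
    // of executing:
    //
    //     Halide::Runtime::Buffer<void> query(halide_type_of<float>(), 0, 0);
    //     my_pipeline(input, query);
    //     // query.dim(i).extent() now reports the required size in dimension i.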
829 
830     /** Allocate a new image of the given size. Pass zeroes to make a
831      * buffer suitable for bounds query calls. */
832     // @{
833 
    // The overload with one argument is 'explicit', so that
    // (say) int is not implicitly convertible to Buffer<int>
    explicit Buffer(int first) {
        static_assert(!T_is_void,
                      "To construct a Buffer<void>, pass a halide_type_t as the first argument to the constructor");
839         int extents[] = {first};
840         buf.type = static_halide_type();
841         constexpr int buf_dimensions = 1;
842         make_shape_storage(buf_dimensions);
843         initialize_shape(extents);
844         if (first != 0) {
845             check_overflow();
846             allocate();
847         }
848     }
849 
850     template<typename... Args,
851              typename = typename std::enable_if<AllInts<Args...>::value>::type>
852     Buffer(int first, int second, Args... rest) {
        static_assert(!T_is_void,
                      "To construct a Buffer<void>, pass a halide_type_t as the first argument to the constructor");
855         int extents[] = {first, second, (int)rest...};
856         buf.type = static_halide_type();
857         constexpr int buf_dimensions = 2 + (int)(sizeof...(rest));
858         make_shape_storage(buf_dimensions);
859         initialize_shape(extents);
860         if (!Internal::any_zero(extents)) {
861             check_overflow();
862             allocate();
863         }
864     }
865     // @}
866 
867     /** Allocate a new image of unknown type using a vector of ints as the size. */
868     Buffer(halide_type_t t, const std::vector<int> &sizes) {
869         if (!T_is_void) {
870             assert(static_halide_type() == t);
871         }
872         buf.type = t;
873         make_shape_storage((int)sizes.size());
874         initialize_shape(sizes);
875         if (!Internal::any_zero(sizes)) {
876             check_overflow();
877             allocate();
878         }
879     }
880 
881     /** Allocate a new image of known type using a vector of ints as the size. */
882     explicit Buffer(const std::vector<int> &sizes)
883         : Buffer(static_halide_type(), sizes) {
884     }
885 
886 private:
887     // Create a copy of the sizes vector, ordered as specified by order.
888     static std::vector<int> make_ordered_sizes(const std::vector<int> &sizes, const std::vector<int> &order) {
889         assert(order.size() == sizes.size());
890         std::vector<int> ordered_sizes(sizes.size());
891         for (size_t i = 0; i < sizes.size(); ++i) {
892             ordered_sizes[i] = sizes.at(order[i]);
893         }
894         return ordered_sizes;
895     }
896 
897 public:
898     /** Allocate a new image of unknown type using a vector of ints as the size and
899      * a vector of indices indicating the storage order for each dimension. The
900      * length of the sizes vector and the storage-order vector must match. For instance,
901      * to allocate an interleaved RGB buffer, you would pass {2, 0, 1} for storage_order. */
902     Buffer(halide_type_t t, const std::vector<int> &sizes, const std::vector<int> &storage_order)
903         : Buffer(t, make_ordered_sizes(sizes, storage_order)) {
904         transpose(storage_order);
905     }
906 
907     Buffer(const std::vector<int> &sizes, const std::vector<int> &storage_order)
908         : Buffer(static_halide_type(), sizes, storage_order) {
909     }
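
    // For illustration, a sketch of the interleaved case described above: an
    // RGB image stored channel-innermost, but still addressed as (x, y, c):
    //
    //     Halide::Runtime::Buffer<uint8_t> rgb({640, 480, 3}, {2, 0, 1});
    //     // rgb.dim(2).stride() == 1, rgb.dim(0).stride() == 3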
910 
    /** Make a Buffer that refers to a statically sized array. Does not
     * take ownership of the data, and does not set the host_dirty flag. */
913     template<typename Array, size_t N>
914     explicit Buffer(Array (&vals)[N]) {
915         const int buf_dimensions = dimensionality_of_array(vals);
916         buf.type = scalar_type_of_array(vals);
917         buf.host = (uint8_t *)vals;
918         make_shape_storage(buf_dimensions);
919         initialize_shape_from_array_shape(buf.dimensions - 1, vals);
920     }
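
    // For illustration, a minimal sketch of wrapping a static array. Note that
    // the innermost (fastest-varying) dimension is the last C array dimension:
    //
    //     int data[480][640];                     // data[y][x]
    //     Halide::Runtime::Buffer<int> im(data);  // im.dim(0).extent() == 640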
921 
    /** Initialize a Buffer of runtime type from a pointer and some
923      * sizes. Assumes dense row-major packing and a min coordinate of
924      * zero. Does not take ownership of the data and does not set the
925      * host_dirty flag. */
926     template<typename... Args,
927              typename = typename std::enable_if<AllInts<Args...>::value>::type>
928     explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int first, Args &&... rest) {
929         if (!T_is_void) {
930             assert(static_halide_type() == t);
931         }
932         int extents[] = {first, (int)rest...};
933         buf.type = t;
934         constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
935         buf.host = (uint8_t *)const_cast<void *>(data);
936         make_shape_storage(buf_dimensions);
937         initialize_shape(extents);
938     }
939 
    /** Initialize a Buffer from a pointer and some sizes. Assumes
941      * dense row-major packing and a min coordinate of zero. Does not
942      * take ownership of the data and does not set the host_dirty flag. */
943     template<typename... Args,
944              typename = typename std::enable_if<AllInts<Args...>::value>::type>
945     explicit Buffer(T *data, int first, Args &&... rest) {
946         int extents[] = {first, (int)rest...};
947         buf.type = static_halide_type();
948         constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
949         buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
950         make_shape_storage(buf_dimensions);
951         initialize_shape(extents);
952     }
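
    // For illustration, a minimal sketch of wrapping memory owned by someone
    // else (here a std::vector) without copying it:
    //
    //     std::vector<float> storage(640 * 480);
    //     Halide::Runtime::Buffer<float> im(storage.data(), 640, 480);
    //     // im does not own storage; keep the vector alive while im is in use.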
953 
    /** Initialize a Buffer from a pointer and a vector of
955      * sizes. Assumes dense row-major packing and a min coordinate of
956      * zero. Does not take ownership of the data and does not set the
957      * host_dirty flag. */
958     explicit Buffer(T *data, const std::vector<int> &sizes) {
959         buf.type = static_halide_type();
960         buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
961         make_shape_storage((int)sizes.size());
962         initialize_shape(sizes);
963     }
964 
    /** Initialize a Buffer of runtime type from a pointer and a
966      * vector of sizes. Assumes dense row-major packing and a min
967      * coordinate of zero. Does not take ownership of the data and
968      * does not set the host_dirty flag. */
969     explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, const std::vector<int> &sizes) {
970         if (!T_is_void) {
971             assert(static_halide_type() == t);
972         }
973         buf.type = t;
974         buf.host = (uint8_t *)const_cast<void *>(data);
975         make_shape_storage((int)sizes.size());
976         initialize_shape(sizes);
977     }
978 
    /** Initialize a Buffer from a pointer to the min coordinate and
980      * an array describing the shape.  Does not take ownership of the
981      * data, and does not set the host_dirty flag. */
982     explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int d, const halide_dimension_t *shape) {
983         if (!T_is_void) {
984             assert(static_halide_type() == t);
985         }
986         buf.type = t;
987         buf.host = (uint8_t *)const_cast<void *>(data);
988         make_shape_storage(d);
989         for (int i = 0; i < d; i++) {
990             buf.dim[i] = shape[i];
991         }
992     }
993 
994     /** Initialize a Buffer from a pointer to the min coordinate and
995      * a vector describing the shape.  Does not take ownership of the
996      * data, and does not set the host_dirty flag. */
997     explicit inline Buffer(halide_type_t t, add_const_if_T_is_const<void> *data,
998                            const std::vector<halide_dimension_t> &shape)
999         : Buffer(t, data, (int)shape.size(), shape.data()) {
1000     }
1001 
    /** Initialize a Buffer from a pointer to the min coordinate and
1003      * an array describing the shape.  Does not take ownership of the
1004      * data and does not set the host_dirty flag. */
1005     explicit Buffer(T *data, int d, const halide_dimension_t *shape) {
1006         buf.type = static_halide_type();
1007         buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
1008         make_shape_storage(d);
1009         for (int i = 0; i < d; i++) {
1010             buf.dim[i] = shape[i];
1011         }
1012     }
1013 
1014     /** Initialize a Buffer from a pointer to the min coordinate and
1015      * a vector describing the shape.  Does not take ownership of the
1016      * data, and does not set the host_dirty flag. */
1017     explicit inline Buffer(T *data, const std::vector<halide_dimension_t> &shape)
1018         : Buffer(data, (int)shape.size(), shape.data()) {
1019     }
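
    // For illustration, a sketch of describing an existing channel-interleaved
    // RGB image with explicit halide_dimension_t entries (min, extent, stride,
    // flags) rather than the dense row-major default. The pixel pointer is
    // hypothetical:
    //
    //     uint8_t *pixels = get_pixels();  // 640x480, interleaved RGB
    //     halide_dimension_t shape[] = {{0, 640, 3, 0},
    //                                   {0, 480, 640 * 3, 0},
    //                                   {0, 3, 1, 0}};
    //     Halide::Runtime::Buffer<uint8_t> im(pixels, 3, shape);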
1020 
1021     /** Destructor. Will release any underlying owned allocation if
1022      * this is the last reference to it. Will assert fail if there are
1023      * weak references to this Buffer outstanding. */
1024     ~Buffer() {
1025         free_shape_storage();
1026         decref();
1027     }
1028 
1029     /** Get a pointer to the raw halide_buffer_t this wraps. */
1030     // @{
1031     halide_buffer_t *raw_buffer() {
1032         return &buf;
1033     }
1034 
1035     const halide_buffer_t *raw_buffer() const {
1036         return &buf;
1037     }
1038     // @}
1039 
1040     /** Provide a cast operator to halide_buffer_t *, so that
1041      * instances can be passed directly to Halide filters. */
1042     operator halide_buffer_t *() {
1043         return &buf;
1044     }
1045 
1046     /** Return a typed reference to this Buffer. Useful for converting
1047      * a reference to a Buffer<void> to a reference to, for example, a
1048      * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1049      * Does a runtime assert if the source buffer type is void. */
1050     template<typename T2, int D2 = D,
1051              typename = typename std::enable_if<(D2 <= D)>::type>
1052     HALIDE_ALWAYS_INLINE
1053         Buffer<T2, D2> &
1054         as() & {
1055         Buffer<T2, D>::assert_can_convert_from(*this);
1056         return *((Buffer<T2, D2> *)this);
1057     }
1058 
1059     /** Return a const typed reference to this Buffer. Useful for
     * converting a const reference to one Buffer type to a const
1061      * reference to another Buffer type. Does a runtime assert if the
1062      * source buffer type is void. */
1063     template<typename T2, int D2 = D,
1064              typename = typename std::enable_if<(D2 <= D)>::type>
1065     HALIDE_ALWAYS_INLINE const Buffer<T2, D2> &as() const & {
1066         Buffer<T2, D>::assert_can_convert_from(*this);
1067         return *((const Buffer<T2, D2> *)this);
1068     }
1069 
1070     /** Returns this rval Buffer with a different type attached. Does
1071      * a dynamic type check if the source type is void. */
1072     template<typename T2, int D2 = D>
1073     HALIDE_ALWAYS_INLINE
1074         Buffer<T2, D2>
1075         as() && {
1076         Buffer<T2, D2>::assert_can_convert_from(*this);
1077         return *((Buffer<T2, D2> *)this);
1078     }
1079 
1080     /** as_const() is syntactic sugar for .as<const T>(), to avoid the need
1081      * to recapitulate the type argument. */
1082     // @{
1083     HALIDE_ALWAYS_INLINE
1084     Buffer<typename std::add_const<T>::type, D> &as_const() & {
1085         // Note that we can skip the assert_can_convert_from(), since T -> const T
1086         // conversion is always legal.
1087         return *((Buffer<typename std::add_const<T>::type> *)this);
1088     }
1089 
1090     HALIDE_ALWAYS_INLINE
1091     const Buffer<typename std::add_const<T>::type, D> &as_const() const & {
1092         return *((const Buffer<typename std::add_const<T>::type> *)this);
1093     }
1094 
1095     HALIDE_ALWAYS_INLINE
1096     Buffer<typename std::add_const<T>::type, D> as_const() && {
1097         return *((Buffer<typename std::add_const<T>::type> *)this);
1098     }
1099     // @}
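
    // For illustration, a minimal sketch of reinterpreting a type-erased buffer
    // (checked at runtime) and of adding constness (checked at compile time):
    //
    //     Halide::Runtime::Buffer<void> untyped = make_a_buffer();  // hypothetical
    //     Halide::Runtime::Buffer<float> &typed = untyped.as<float>();
    //     Halide::Runtime::Buffer<const float> &ro = typed.as_const();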
1100 
1101     /** Conventional names for the first three dimensions. */
1102     // @{
1103     int width() const {
1104         return (dimensions() > 0) ? dim(0).extent() : 1;
1105     }
1106     int height() const {
1107         return (dimensions() > 1) ? dim(1).extent() : 1;
1108     }
1109     int channels() const {
1110         return (dimensions() > 2) ? dim(2).extent() : 1;
1111     }
1112     // @}
1113 
1114     /** Conventional names for the min and max value of each dimension */
1115     // @{
1116     int left() const {
1117         return dim(0).min();
1118     }
1119 
1120     int right() const {
1121         return dim(0).max();
1122     }
1123 
1124     int top() const {
1125         return dim(1).min();
1126     }
1127 
1128     int bottom() const {
1129         return dim(1).max();
1130     }
1131     // @}
1132 
1133     /** Make a new image which is a deep copy of this image. Use crop
1134      * or slice followed by copy to make a copy of only a portion of
1135      * the image. The new image uses the same memory layout as the
1136      * original, with holes compacted away. Note that the returned
     * Buffer is always of a non-const type T; that is,
1138      *
1139      *     Buffer<const T>.copy() -> Buffer<T> rather than Buffer<const T>
1140      *
1141      * which is always safe, since we are making a deep copy. (The caller
1142      * can easily cast it back to Buffer<const T> if desired, which is
1143      * always safe and free.)
1144      */
1145     Buffer<not_const_T, D> copy(void *(*allocate_fn)(size_t) = nullptr,
1146                                 void (*deallocate_fn)(void *) = nullptr) const {
1147         Buffer<not_const_T, D> dst = Buffer<not_const_T, D>::make_with_shape_of(*this, allocate_fn, deallocate_fn);
1148         dst.copy_from(*this);
1149         return dst;
1150     }
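
    // For illustration, a minimal sketch of compacting a region of interest:
    // crop to the region, then deep-copy it into a freshly laid-out buffer.
    // The input buffer here is hypothetical:
    //
    //     Halide::Runtime::Buffer<const uint8_t> roi = input.cropped(0, 32, 64);
    //     Halide::Runtime::Buffer<uint8_t> packed = roi.copy();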
1151 
1152     /** Like copy(), but the copy is created in interleaved memory layout
1153      * (vs. keeping the same memory layout as the original). Requires that 'this'
1154      * has exactly 3 dimensions.
1155      */
1156     Buffer<not_const_T, D> copy_to_interleaved(void *(*allocate_fn)(size_t) = nullptr,
1157                                                void (*deallocate_fn)(void *) = nullptr) const {
1158         assert(dimensions() == 3);
1159         Buffer<not_const_T, D> dst = Buffer<not_const_T, D>::make_interleaved(nullptr, width(), height(), channels());
1160         dst.set_min(min(0), min(1), min(2));
1161         dst.allocate(allocate_fn, deallocate_fn);
1162         dst.copy_from(*this);
1163         return dst;
1164     }
1165 
1166     /** Like copy(), but the copy is created in planar memory layout
1167      * (vs. keeping the same memory layout as the original).
1168      */
1169     Buffer<not_const_T, D> copy_to_planar(void *(*allocate_fn)(size_t) = nullptr,
1170                                           void (*deallocate_fn)(void *) = nullptr) const {
1171         std::vector<int> mins, extents;
1172         const int dims = dimensions();
1173         mins.reserve(dims);
1174         extents.reserve(dims);
1175         for (int d = 0; d < dims; ++d) {
1176             mins.push_back(dim(d).min());
1177             extents.push_back(dim(d).extent());
1178         }
1179         Buffer<not_const_T, D> dst = Buffer<not_const_T, D>(nullptr, extents);
1180         dst.set_min(mins);
1181         dst.allocate(allocate_fn, deallocate_fn);
1182         dst.copy_from(*this);
1183         return dst;
1184     }
1185 
1186     /** Make a copy of the Buffer which shares the underlying host and/or device
1187      * allocations as the existing Buffer. This is purely syntactic sugar for
1188      * cases where you have a const reference to a Buffer but need a temporary
1189      * non-const copy (e.g. to make a call into AOT-generated Halide code), and want a terse
1190      * inline way to create a temporary. \code
1191      * void call_my_func(const Buffer<const uint8_t>& input) {
1192      *     my_func(input.alias(), output);
1193      * }\endcode
1194      */
1195     inline Buffer<T, D> alias() const {
1196         return *this;
1197     }
1198 
1199     /** Fill a Buffer with the values at the same coordinates in
1200      * another Buffer. Restricts itself to coordinates contained
1201      * within the intersection of the two buffers. If the two Buffers
1202      * are not in the same coordinate system, you will need to
1203      * translate the argument Buffer first. E.g. if you're blitting a
1204      * sprite onto a framebuffer, you'll want to translate the sprite
1205      * to the correct location first like so: \code
1206      * framebuffer.copy_from(sprite.translated({x, y})); \endcode
1207     */
1208     template<typename T2, int D2>
1209     void copy_from(const Buffer<T2, D2> &other) {
1210         static_assert(!std::is_const<T>::value, "Cannot call copy_from() on a Buffer<const T>");
1211         assert(!device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty destination.");
1212         assert(!other.device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty source.");
1213 
1214         Buffer<const T, D> src(other);
1215         Buffer<T, D> dst(*this);
1216 
1217         assert(src.dimensions() == dst.dimensions());
1218 
1219         // Trim the copy to the region in common
1220         for (int i = 0; i < dimensions(); i++) {
1221             int min_coord = std::max(dst.dim(i).min(), src.dim(i).min());
1222             int max_coord = std::min(dst.dim(i).max(), src.dim(i).max());
1223             if (max_coord < min_coord) {
1224                 // The buffers do not overlap.
1225                 return;
1226             }
1227             dst.crop(i, min_coord, max_coord - min_coord + 1);
1228             src.crop(i, min_coord, max_coord - min_coord + 1);
1229         }
1230 
        // If T is void, we need to do runtime dispatch to an
        // appropriately-typed lambda. We're copying, so we only care
        // about the element size. (If T is not void, this should
        // optimize away into a static dispatch to the right-sized copy.)
1235         if (T_is_void ? (type().bytes() == 1) : (sizeof(not_void_T) == 1)) {
1236             using MemType = uint8_t;
1237             auto &typed_dst = (Buffer<MemType, D> &)dst;
1238             auto &typed_src = (Buffer<const MemType, D> &)src;
1239             typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1240         } else if (T_is_void ? (type().bytes() == 2) : (sizeof(not_void_T) == 2)) {
1241             using MemType = uint16_t;
1242             auto &typed_dst = (Buffer<MemType, D> &)dst;
1243             auto &typed_src = (Buffer<const MemType, D> &)src;
1244             typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1245         } else if (T_is_void ? (type().bytes() == 4) : (sizeof(not_void_T) == 4)) {
1246             using MemType = uint32_t;
1247             auto &typed_dst = (Buffer<MemType, D> &)dst;
1248             auto &typed_src = (Buffer<const MemType, D> &)src;
1249             typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1250         } else if (T_is_void ? (type().bytes() == 8) : (sizeof(not_void_T) == 8)) {
1251             using MemType = uint64_t;
1252             auto &typed_dst = (Buffer<MemType, D> &)dst;
1253             auto &typed_src = (Buffer<const MemType, D> &)src;
1254             typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1255         } else {
1256             assert(false && "type().bytes() must be 1, 2, 4, or 8");
1257         }
1258         set_host_dirty();
1259     }
1260 
1261     /** Make an image that refers to a sub-range of this image along
1262      * the given dimension. Asserts that the crop region is within
1263      * the existing bounds: you cannot "crop outwards", even if you know there
1264      * is valid Buffer storage (e.g. because you already cropped inwards). */
1265     Buffer<T, D> cropped(int d, int min, int extent) const {
1266         // Make a fresh copy of the underlying buffer (but not a fresh
1267         // copy of the allocation, if there is one).
1268         Buffer<T, D> im = *this;
1269 
        // This guarantees the preexisting device ref is dropped if the
1271         // device_crop call fails and maintains the buffer in a consistent
1272         // state.
1273         im.device_deallocate();
1274 
1275         im.crop_host(d, min, extent);
1276         if (buf.device_interface != nullptr) {
1277             complete_device_crop(im);
1278         }
1279         return im;
1280     }
1281 
1282     /** Crop an image in-place along the given dimension. This does
1283      * not move any data around in memory - it just changes the min
1284      * and extent of the given dimension. */
1285     void crop(int d, int min, int extent) {
1286         // An optimization for non-device buffers. For the device case,
1287         // a temp buffer is required, so reuse the not-in-place version.
1288         // TODO(zalman|abadams): Are nop crops common enough to special
1289         // case the device part of the if to do nothing?
1290         if (buf.device_interface != nullptr) {
1291             *this = cropped(d, min, extent);
1292         } else {
1293             crop_host(d, min, extent);
1294         }
1295     }
1296 
1297     /** Make an image that refers to a sub-rectangle of this image along
1298      * the first N dimensions. Asserts that the crop region is within
1299      * the existing bounds. The cropped image may drop any device handle
1300      * if the device_interface cannot accomplish the crop in-place. */
1301     Buffer<T, D> cropped(const std::vector<std::pair<int, int>> &rect) const {
1302         // Make a fresh copy of the underlying buffer (but not a fresh
1303         // copy of the allocation, if there is one).
1304         Buffer<T, D> im = *this;
1305 
        // This guarantees the preexisting device ref is dropped if the
1307         // device_crop call fails and maintains the buffer in a consistent
1308         // state.
1309         im.device_deallocate();
1310 
1311         im.crop_host(rect);
1312         if (buf.device_interface != nullptr) {
1313             complete_device_crop(im);
1314         }
1315         return im;
1316     }
1317 
1318     /** Crop an image in-place along the first N dimensions. This does
1319      * not move any data around in memory, nor does it free memory. It
1320      * just rewrites the min/extent of each dimension to refer to a
1321      * subregion of the same allocation. */
1322     void crop(const std::vector<std::pair<int, int>> &rect) {
1323         // An optimization for non-device buffers. For the device case,
1324         // a temp buffer is required, so reuse the not-in-place version.
1325         // TODO(zalman|abadams): Are nop crops common enough to special
1326         // case the device part of the if to do nothing?
1327         if (buf.device_interface != nullptr) {
1328             *this = cropped(rect);
1329         } else {
1330             crop_host(rect);
1331         }
1332     }
1333 
    /** Make an image which refers to the same data using
1335      * translated coordinates in the given dimension. Positive values
1336      * move the image data to the right or down relative to the
1337      * coordinate system. Drops any device handle. */
1338     Buffer<T, D> translated(int d, int dx) const {
1339         Buffer<T, D> im = *this;
1340         im.translate(d, dx);
1341         return im;
1342     }
1343 
1344     /** Translate an image in-place along one dimension by changing
1345      * how it is indexed. Does not move any data around in memory. */
1346     void translate(int d, int delta) {
1347         assert(d >= 0 && d < this->dimensions());
1348         device_deallocate();
1349         buf.dim[d].min += delta;
1350     }
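
    /* A minimal usage sketch (illustrative; 'im' is hypothetical). After
     * translating, the same memory is addressed at shifted coordinates:
     \code
     Buffer<uint8_t, 2> im(10, 10);   // x and y initially span [0, 10)
     uint8_t *p = &im(0, 0);
     im.translate(0, 5);              // x now spans [5, 15)
     assert(&im(5, 0) == p);          // same element, new coordinate
     \endcode
     */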
1351 
1352     /** Make an image which refers to the same data translated along
1353      * the first N dimensions. */
1354     Buffer<T, D> translated(const std::vector<int> &delta) const {
1355         Buffer<T, D> im = *this;
1356         im.translate(delta);
1357         return im;
1358     }
1359 
1360     /** Translate an image along the first N dimensions by changing
1361      * how it is indexed. Does not move any data around in memory. */
1362     void translate(const std::vector<int> &delta) {
1363         device_deallocate();
1364         assert(delta.size() <= static_cast<decltype(delta.size())>(std::numeric_limits<int>::max()));
1365         int limit = (int)delta.size();
1366         assert(limit <= dimensions());
1367         for (int i = 0; i < limit; i++) {
1368             translate(i, delta[i]);
1369         }
1370     }
1371 
1372     /** Set the min coordinate of an image in the first N dimensions. */
1373     // @{
1374     void set_min(const std::vector<int> &mins) {
1375         assert(mins.size() <= static_cast<decltype(mins.size())>(dimensions()));
1376         device_deallocate();
1377         for (size_t i = 0; i < mins.size(); i++) {
1378             buf.dim[i].min = mins[i];
1379         }
1380     }
1381 
1382     template<typename... Args>
1383     void set_min(Args... args) {
1384         set_min(std::vector<int>{args...});
1385     }
1386     // @}
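
    /* A minimal usage sketch (illustrative; 'im' is hypothetical): center
     * a 100x100 buffer on the origin:
     \code
     Buffer<float, 2> im(100, 100);
     im.set_min(-50, -50);
     assert(im.dim(0).min() == -50 && im.dim(0).max() == 49);
     \endcode
     */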
1387 
1388     /** Test if a given coordinate is within the bounds of an image. */
1389     // @{
1390     bool contains(const std::vector<int> &coords) const {
1391         assert(coords.size() <= static_cast<decltype(coords.size())>(dimensions()));
1392         for (size_t i = 0; i < coords.size(); i++) {
1393             if (coords[i] < dim((int)i).min() || coords[i] > dim((int)i).max()) {
1394                 return false;
1395             }
1396         }
1397         return true;
1398     }
1399 
1400     template<typename... Args>
1401     bool contains(Args... args) const {
1402         return contains(std::vector<int>{args...});
1403     }
1404     // @}
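
    /* A minimal usage sketch (illustrative; 'im' is hypothetical):
     \code
     Buffer<int, 2> im(8, 8);
     assert(im.contains(3, 7));
     assert(!im.contains(8, 0));   // extent is 8, so the max coordinate is 7
     assert(im.contains(std::vector<int>{0, 0}));
     \endcode
     */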
1405 
1406     /** Make a buffer which refers to the same data in the same layout
1407      * using a swapped indexing order for the dimensions given. So
1408      * A = B.transposed(0, 1) means that A(i, j) == B(j, i), and more
1409      * strongly that A.address_of(i, j) == B.address_of(j, i). */
1410     Buffer<T, D> transposed(int d1, int d2) const {
1411         Buffer<T, D> im = *this;
1412         im.transpose(d1, d2);
1413         return im;
1414     }
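
    /* A minimal usage sketch (illustrative; 'a' and 'b' are hypothetical).
     * The transposed view shares storage and simply swaps how the two
     * dimensions are indexed:
     \code
     Buffer<float, 2> a(640, 480);
     Buffer<float, 2> b = a.transposed(0, 1);
     assert(&b(7, 3) == &a(3, 7));
     \endcode
     */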
1415 
1416     /** Transpose a buffer in-place by changing how it is indexed. For
1417      * example, transpose(0, 1) on a two-dimensional buffer means that
1418      * the value referred to by coordinates (i, j) is now reached at
1419      * the coordinates (j, i), and vice versa. This is done by
1420      * reordering the per-dimension metadata rather than by moving
1421      * data around in memory, so other views of the same memory will
1422      * not see the data as having been transposed. */
1423     void transpose(int d1, int d2) {
1424         assert(d1 >= 0 && d1 < this->dimensions());
1425         assert(d2 >= 0 && d2 < this->dimensions());
1426         std::swap(buf.dim[d1], buf.dim[d2]);
1427     }
1428 
1429     /** A generalized transpose: instead of swapping two dimensions,
1430      * pass a vector that lists each dimension index exactly once, in
1431      * the desired order. This does not move any data around in memory
1432      * - it just permutes how it is indexed. */
1433     void transpose(const std::vector<int> &order) {
1434         assert((int)order.size() == dimensions());
1435         if (dimensions() < 2) {
1436             // My, that was easy
1437             return;
1438         }
1439 
1440         std::vector<int> order_sorted = order;
1441         for (size_t i = 1; i < order_sorted.size(); i++) {
1442             for (size_t j = i; j > 0 && order_sorted[j - 1] > order_sorted[j]; j--) {
1443                 std::swap(order_sorted[j], order_sorted[j - 1]);
1444                 transpose(j, j - 1);
1445             }
1446         }
1447     }
1448 
1449     /** Make a buffer which refers to the same data in the same
1450      * layout using a different ordering of the dimensions. */
1451     Buffer<T, D> transposed(const std::vector<int> &order) const {
1452         Buffer<T, D> im = *this;
1453         im.transpose(order);
1454         return im;
1455     }
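
    /* A minimal usage sketch (illustrative; 'chunky' and 'xyc' are
     * hypothetical), matching the equivalence noted in make_interleaved()
     * below: a buffer laid out as (c, x, y) viewed as (x, y, c):
     \code
     Buffer<uint8_t, 3> chunky(3, 640, 480);            // (c, x, y)
     Buffer<uint8_t, 3> xyc = chunky.transposed({2, 0, 1});
     assert(&xyc(10, 20, 1) == &chunky(1, 10, 20));
     \endcode
     */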
1456 
1457     /** Make a lower-dimensional buffer that refers to one slice of
1458      * this buffer. */
1459     Buffer<T, D> sliced(int d, int pos) const {
1460         Buffer<T, D> im = *this;
1461 
        // This guarantees the preexisting device ref is dropped if the
1463         // device_slice call fails and maintains the buffer in a consistent
1464         // state.
1465         im.device_deallocate();
1466 
1467         im.slice_host(d, pos);
1468         if (buf.device_interface != nullptr) {
1469             complete_device_slice(im, d, pos);
1470         }
1471         return im;
1472     }
1473 
1474     /** Make a lower-dimensional buffer that refers to one slice of this
1475      * buffer at the dimension's minimum. */
1476     inline Buffer<T, D> sliced(int d) const {
1477         return sliced(d, dim(d).min());
1478     }
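
    /* A minimal usage sketch (illustrative; 'im' and 'green' are
     * hypothetical): view one channel of a planar image as a 2D buffer:
     \code
     Buffer<float, 3> im(100, 100, 3);
     Buffer<float, 3> green = im.sliced(2, 1);   // now has 2 dimensions
     assert(green.dimensions() == 2);
     assert(&green(10, 20) == &im(10, 20, 1));
     \endcode
     */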
1479 
1480     /** Rewrite the buffer to refer to a single lower-dimensional
1481      * slice of itself along the given dimension at the given
1482      * coordinate. Does not move any data around or free the original
1483      * memory, so other views of the same data are unaffected. */
1484     void slice(int d, int pos) {
1485         // An optimization for non-device buffers. For the device case,
1486         // a temp buffer is required, so reuse the not-in-place version.
1487         // TODO(zalman|abadams): Are nop slices common enough to special
1488         // case the device part of the if to do nothing?
1489         if (buf.device_interface != nullptr) {
1490             *this = sliced(d, pos);
1491         } else {
1492             slice_host(d, pos);
1493         }
1494     }
1495 
1496     /** Slice a buffer in-place at the dimension's minimum. */
1497     inline void slice(int d) {
1498         slice(d, dim(d).min());
1499     }
1500 
1501     /** Make a new buffer that views this buffer as a single slice in a
1502      * higher-dimensional space. The new dimension has extent one and
1503      * the given min. This operation is the opposite of slice. As an
1504      * example, the following condition is true:
1505      *
1506      \code
1507      im2 = im.embedded(1, 17);
1508      &im(x, y, c) == &im2(x, 17, y, c);
1509      \endcode
1510      */
1511     Buffer<T, D> embedded(int d, int pos = 0) const {
1512         Buffer<T, D> im(*this);
1513         im.embed(d, pos);
1514         return im;
1515     }
1516 
1517     /** Embed a buffer in-place, increasing the
1518      * dimensionality. */
1519     void embed(int d, int pos = 0) {
1520         assert(d >= 0 && d <= dimensions());
1521         add_dimension();
1522         translate(dimensions() - 1, pos);
1523         for (int i = dimensions() - 1; i > d; i--) {
1524             transpose(i, i - 1);
1525         }
1526     }
1527 
1528     /** Add a new dimension with a min of zero and an extent of
1529      * one. The stride is the extent of the outermost dimension times
1530      * its stride. The new dimension is the last dimension. This is a
1531      * special case of embed. */
1532     void add_dimension() {
1533         const int dims = buf.dimensions;
1534         buf.dimensions++;
1535         if (buf.dim != shape) {
1536             // We're already on the heap. Reallocate.
1537             halide_dimension_t *new_shape = new halide_dimension_t[buf.dimensions];
1538             for (int i = 0; i < dims; i++) {
1539                 new_shape[i] = buf.dim[i];
1540             }
1541             delete[] buf.dim;
1542             buf.dim = new_shape;
1543         } else if (dims == D) {
1544             // Transition from the in-class storage to the heap
1545             make_shape_storage(buf.dimensions);
1546             for (int i = 0; i < dims; i++) {
1547                 buf.dim[i] = shape[i];
1548             }
1549         } else {
1550             // We still fit in the class
1551         }
1552         buf.dim[dims] = {0, 1, 0};
1553         if (dims == 0) {
1554             buf.dim[dims].stride = 1;
1555         } else {
1556             buf.dim[dims].stride = buf.dim[dims - 1].extent * buf.dim[dims - 1].stride;
1557         }
1558     }
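
    /* A minimal usage sketch (illustrative; 'im' is hypothetical). The new
     * trailing dimension has min 0 and extent 1, so existing sites just
     * gain a trailing zero coordinate:
     \code
     Buffer<int, 4> im(16, 16);
     im.add_dimension();
     assert(im.dimensions() == 3 && im.dim(2).extent() == 1);
     assert(&im(5, 6, 0) == &im(5, 6));   // omitted coords default to the min
     \endcode
     */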
1559 
1560     /** Add a new dimension with a min of zero, an extent of one, and
1561      * the specified stride. The new dimension is the last
1562      * dimension. This is a special case of embed. */
1563     void add_dimension_with_stride(int s) {
1564         add_dimension();
1565         buf.dim[buf.dimensions - 1].stride = s;
1566     }
1567 
1568     /** Methods for managing any GPU allocation. */
1569     // @{
1570     // Set the host dirty flag. Called by every operator()
1571     // access. Must be inlined so it can be hoisted out of loops.
1572     HALIDE_ALWAYS_INLINE
1573     void set_host_dirty(bool v = true) {
1574         assert((!v || !device_dirty()) && "Cannot set host dirty when device is already dirty. Call copy_to_host() before accessing the buffer from host.");
1575         buf.set_host_dirty(v);
1576     }
1577 
1578     // Check if the device allocation is dirty. Called by
1579     // set_host_dirty, which is called by every accessor. Must be
1580     // inlined so it can be hoisted out of loops.
1581     HALIDE_ALWAYS_INLINE
1582     bool device_dirty() const {
1583         return buf.device_dirty();
1584     }
1585 
1586     bool host_dirty() const {
1587         return buf.host_dirty();
1588     }
1589 
1590     void set_device_dirty(bool v = true) {
1591         assert((!v || !host_dirty()) && "Cannot set device dirty when host is already dirty.");
1592         buf.set_device_dirty(v);
1593     }
1594 
1595     int copy_to_host(void *ctx = nullptr) {
1596         if (device_dirty()) {
1597             return buf.device_interface->copy_to_host(ctx, &buf);
1598         }
1599         return 0;
1600     }
1601 
1602     int copy_to_device(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1603         if (host_dirty()) {
1604             return device_interface->copy_to_device(ctx, &buf, device_interface);
1605         }
1606         return 0;
1607     }
1608 
1609     int device_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1610         return device_interface->device_malloc(ctx, &buf, device_interface);
1611     }
1612 
1613     int device_free(void *ctx = nullptr) {
1614         if (dev_ref_count) {
1615             assert(dev_ref_count->ownership == BufferDeviceOwnership::Allocated &&
1616                    "Can't call device_free on an unmanaged or wrapped native device handle. "
1617                    "Free the source allocation or call device_detach_native instead.");
1618             // Multiple people may be holding onto this dev field
1619             assert(dev_ref_count->count == 1 &&
1620                    "Multiple Halide::Runtime::Buffer objects share this device "
1621                    "allocation. Freeing it would create dangling references. "
1622                    "Don't call device_free on Halide buffers that you have copied or "
1623                    "passed by value.");
1624         }
1625         int ret = 0;
1626         if (buf.device_interface) {
1627             ret = buf.device_interface->device_free(ctx, &buf);
1628         }
1629         if (dev_ref_count) {
1630             delete dev_ref_count;
1631             dev_ref_count = nullptr;
1632         }
1633         return ret;
1634     }
1635 
1636     int device_wrap_native(const struct halide_device_interface_t *device_interface,
1637                            uint64_t handle, void *ctx = nullptr) {
1638         assert(device_interface);
1639         dev_ref_count = new DeviceRefCount;
1640         dev_ref_count->ownership = BufferDeviceOwnership::WrappedNative;
1641         return device_interface->wrap_native(ctx, &buf, handle, device_interface);
1642     }
1643 
1644     int device_detach_native(void *ctx = nullptr) {
1645         assert(dev_ref_count &&
1646                dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative &&
1647                "Only call device_detach_native on buffers wrapping a native "
1648                "device handle via device_wrap_native. This buffer was allocated "
1649                "using device_malloc, or is unmanaged. "
1650                "Call device_free or free the original allocation instead.");
1651         // Multiple people may be holding onto this dev field
1652         assert(dev_ref_count->count == 1 &&
1653                "Multiple Halide::Runtime::Buffer objects share this device "
1654                "allocation. Freeing it could create dangling references. "
1655                "Don't call device_detach_native on Halide buffers that you "
1656                "have copied or passed by value.");
1657         int ret = 0;
1658         if (buf.device_interface) {
1659             ret = buf.device_interface->detach_native(ctx, &buf);
1660         }
1661         delete dev_ref_count;
1662         dev_ref_count = nullptr;
1663         return ret;
1664     }
1665 
1666     int device_and_host_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1667         return device_interface->device_and_host_malloc(ctx, &buf, device_interface);
1668     }
1669 
1670     int device_and_host_free(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1671         if (dev_ref_count) {
1672             assert(dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost &&
1673                    "Can't call device_and_host_free on a device handle not allocated with device_and_host_malloc. "
1674                    "Free the source allocation or call device_detach_native instead.");
1675             // Multiple people may be holding onto this dev field
1676             assert(dev_ref_count->count == 1 &&
1677                    "Multiple Halide::Runtime::Buffer objects share this device "
1678                    "allocation. Freeing it would create dangling references. "
1679                    "Don't call device_and_host_free on Halide buffers that you have copied or "
1680                    "passed by value.");
1681         }
1682         int ret = 0;
1683         if (buf.device_interface) {
1684             ret = buf.device_interface->device_and_host_free(ctx, &buf);
1685         }
1686         if (dev_ref_count) {
1687             delete dev_ref_count;
1688             dev_ref_count = nullptr;
1689         }
1690         return ret;
1691     }
1692 
1693     int device_sync(void *ctx = nullptr) {
1694         if (buf.device_interface) {
1695             return buf.device_interface->device_sync(ctx, &buf);
1696         } else {
1697             return 0;
1698         }
1699     }
1700 
1701     bool has_device_allocation() const {
1702         return buf.device != 0;
1703     }
1704 
1705     /** Return the method by which the device field is managed. */
1706     BufferDeviceOwnership device_ownership() const {
1707         if (dev_ref_count == nullptr) {
1708             return BufferDeviceOwnership::Allocated;
1709         }
1710         return dev_ref_count->ownership;
1711     }
1712     // @}
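
    /* A minimal usage sketch of the dirty-flag protocol (illustrative; it
     * assumes a CUDA-enabled runtime, so that halide_cuda_device_interface()
     * from HalideRuntimeCuda.h is available; substitute whichever device
     * interface your target actually provides):
     \code
     const halide_device_interface_t *di = halide_cuda_device_interface();
     Buffer<float, 2> im(640, 480);
     im.fill(0.0f);              // host writes set host_dirty
     im.copy_to_device(di);      // uploads because host_dirty is set
     // ... run a pipeline that writes im on the device, marking device_dirty ...
     im.copy_to_host();          // downloads only if device_dirty is set
     \endcode
     */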
1713 
1714     /** If you use the (x, y, c) indexing convention, then Halide
1715      * Buffers are stored planar by default. This function constructs
1716      * an interleaved RGB or RGBA image that can still be indexed
1717      * using (x, y, c). Passing it to a generator requires that the
1718      * generator has been compiled with support for interleaved (also
1719      * known as packed or chunky) memory layouts. */
1720     static Buffer<void, D> make_interleaved(halide_type_t t, int width, int height, int channels) {
1721         Buffer<void, D> im(t, channels, width, height);
1722         // Note that this is equivalent to calling transpose({2, 0, 1}),
1723         // but slightly more efficient.
1724         im.transpose(0, 1);
1725         im.transpose(1, 2);
1726         return im;
1727     }
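
    /* A minimal usage sketch (illustrative; 'im' is hypothetical). In the
     * interleaved layout the channel dimension has stride 1 and x has a
     * stride equal to the channel count:
     \code
     Buffer<uint8_t, 3> im = Buffer<uint8_t, 3>::make_interleaved(640, 480, 3);
     assert(im.dim(0).stride() == 3);   // x
     assert(im.dim(2).stride() == 1);   // c
     assert(&im(0, 0, 1) == &im(0, 0, 0) + 1);
     \endcode
     */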
1728 
1729     /** If you use the (x, y, c) indexing convention, then Halide
1730      * Buffers are stored planar by default. This function constructs
1731      * an interleaved RGB or RGBA image that can still be indexed
1732      * using (x, y, c). Passing it to a generator requires that the
1733      * generator has been compiled with support for interleaved (also
1734      * known as packed or chunky) memory layouts. */
1735     static Buffer<T, D> make_interleaved(int width, int height, int channels) {
1736         return make_interleaved(static_halide_type(), width, height, channels);
1737     }
1738 
1739     /** Wrap an existing interleaved image. */
1740     static Buffer<add_const_if_T_is_const<void>, D>
1741     make_interleaved(halide_type_t t, T *data, int width, int height, int channels) {
1742         Buffer<add_const_if_T_is_const<void>, D> im(t, data, channels, width, height);
1743         im.transpose(0, 1);
1744         im.transpose(1, 2);
1745         return im;
1746     }
1747 
1748     /** Wrap an existing interleaved image. */
1749     static Buffer<T, D> make_interleaved(T *data, int width, int height, int channels) {
1750         return make_interleaved(static_halide_type(), data, width, height, channels);
1751     }
1752 
1753     /** Make a zero-dimensional Buffer */
1754     static Buffer<add_const_if_T_is_const<void>, D> make_scalar(halide_type_t t) {
1755         Buffer<add_const_if_T_is_const<void>, 1> buf(t, 1);
1756         buf.slice(0, 0);
1757         return buf;
1758     }
1759 
1760     /** Make a zero-dimensional Buffer */
1761     static Buffer<T, D> make_scalar() {
1762         Buffer<T, 1> buf(1);
1763         buf.slice(0, 0);
1764         return buf;
1765     }
1766 
1767     /** Make a zero-dimensional Buffer that points to non-owned, existing data */
1768     static Buffer<T, D> make_scalar(T *data) {
1769         Buffer<T, 1> buf(data, 1);
1770         buf.slice(0, 0);
1771         return buf;
1772     }
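
    /* A minimal usage sketch (illustrative; 's' is hypothetical). A
     * zero-dimensional buffer holds a single value addressed with no
     * coordinates:
     \code
     Buffer<float> s = Buffer<float>::make_scalar();
     s() = 42.0f;
     assert(s.dimensions() == 0);
     \endcode
     */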
1773 
1774     /** Make a buffer with the same shape and memory nesting order as
1775      * another buffer. It may have a different type. */
1776     template<typename T2, int D2>
1777     static Buffer<T, D> make_with_shape_of(Buffer<T2, D2> src,
1778                                            void *(*allocate_fn)(size_t) = nullptr,
1779                                            void (*deallocate_fn)(void *) = nullptr) {
1780 
1781         const halide_type_t dst_type = T_is_void ? src.type() : halide_type_of<typename std::remove_cv<not_void_T>::type>();
1782         return Buffer<>::make_with_shape_of_helper(dst_type, src.dimensions(), src.buf.dim,
1783                                                    allocate_fn, deallocate_fn);
1784     }
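
    /* A minimal usage sketch (illustrative; 'rgb' and 'f' are hypothetical):
     * allocate a float buffer with the same shape and nesting order as an
     * interleaved uint8 image:
     \code
     auto rgb = Buffer<uint8_t, 3>::make_interleaved(640, 480, 3);
     Buffer<float, 3> f = Buffer<float, 3>::make_with_shape_of(rgb);
     assert(f.dim(0).stride() == 3);    // still interleaved
     \endcode
     */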
1785 
1786 private:
1787     static Buffer<> make_with_shape_of_helper(halide_type_t dst_type,
1788                                               int dimensions,
1789                                               halide_dimension_t *shape,
1790                                               void *(*allocate_fn)(size_t),
1791                                               void (*deallocate_fn)(void *)) {
1792         // Reorder the dimensions of src to have strides in increasing order
1793         std::vector<int> swaps;
1794         for (int i = dimensions - 1; i > 0; i--) {
1795             for (int j = i; j > 0; j--) {
1796                 if (shape[j - 1].stride > shape[j].stride) {
1797                     std::swap(shape[j - 1], shape[j]);
1798                     swaps.push_back(j);
1799                 }
1800             }
1801         }
1802 
1803         // Rewrite the strides to be dense (this messes up src, which
1804         // is why we took it by value).
1805         for (int i = 0; i < dimensions; i++) {
1806             if (i == 0) {
1807                 shape[i].stride = 1;
1808             } else {
1809                 shape[i].stride = shape[i - 1].extent * shape[i - 1].stride;
1810             }
1811         }
1812 
1813         // Undo the dimension reordering
1814         while (!swaps.empty()) {
1815             int j = swaps.back();
1816             std::swap(shape[j - 1], shape[j]);
1817             swaps.pop_back();
1818         }
1819 
1820         // Use an explicit runtime type, and make dst a Buffer<void>, to allow
1821         // using this method with Buffer<void> for either src or dst.
1822         Buffer<> dst(dst_type, nullptr, dimensions, shape);
1823         dst.allocate(allocate_fn, deallocate_fn);
1824 
1825         return dst;
1826     }
1827 
1828     template<typename... Args>
1829     HALIDE_ALWAYS_INLINE
1830         ptrdiff_t
1831         offset_of(int d, int first, Args... rest) const {
1832         return offset_of(d + 1, rest...) + this->buf.dim[d].stride * (first - this->buf.dim[d].min);
1833     }
1834 
1835     HALIDE_ALWAYS_INLINE
1836     ptrdiff_t offset_of(int d) const {
1837         return 0;
1838     }
1839 
1840     template<typename... Args>
1841     HALIDE_ALWAYS_INLINE
1842         storage_T *
1843         address_of(Args... args) const {
1844         if (T_is_void) {
1845             return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes();
1846         } else {
1847             return (storage_T *)(this->buf.host) + offset_of(0, args...);
1848         }
1849     }
1850 
1851     HALIDE_ALWAYS_INLINE
1852     ptrdiff_t offset_of(const int *pos) const {
1853         ptrdiff_t offset = 0;
1854         for (int i = this->dimensions() - 1; i >= 0; i--) {
1855             offset += this->buf.dim[i].stride * (pos[i] - this->buf.dim[i].min);
1856         }
1857         return offset;
1858     }
1859 
1860     HALIDE_ALWAYS_INLINE
1861     storage_T *address_of(const int *pos) const {
1862         if (T_is_void) {
1863             return (storage_T *)this->buf.host + offset_of(pos) * type().bytes();
1864         } else {
1865             return (storage_T *)this->buf.host + offset_of(pos);
1866         }
1867     }
1868 
1869 public:
1870     /** Get a pointer to the address of the min coordinate. */
1871     T *data() const {
1872         return (T *)(this->buf.host);
1873     }
1874 
1875     /** Access elements. Use im(...) to get a reference to an element,
1876      * and use &im(...) to get the address of an element. If you pass
1877      * fewer arguments than the buffer has dimensions, the rest are
1878      * treated as their min coordinate. The non-const versions set the
1879      * host_dirty flag to true.
1880      */
    // @{
1882     template<typename... Args,
1883              typename = typename std::enable_if<AllInts<Args...>::value>::type>
1884     HALIDE_ALWAYS_INLINE const not_void_T &operator()(int first, Args... rest) const {
1885         static_assert(!T_is_void,
1886                       "Cannot use operator() on Buffer<void> types");
1887         assert(!device_dirty());
1888         return *((const not_void_T *)(address_of(first, rest...)));
1889     }
1890 
1891     HALIDE_ALWAYS_INLINE
1892     const not_void_T &
1893     operator()() const {
1894         static_assert(!T_is_void,
1895                       "Cannot use operator() on Buffer<void> types");
1896         assert(!device_dirty());
1897         return *((const not_void_T *)(data()));
1898     }
1899 
1900     HALIDE_ALWAYS_INLINE
1901     const not_void_T &
1902     operator()(const int *pos) const {
1903         static_assert(!T_is_void,
1904                       "Cannot use operator() on Buffer<void> types");
1905         assert(!device_dirty());
1906         return *((const not_void_T *)(address_of(pos)));
1907     }
1908 
1909     template<typename... Args,
1910              typename = typename std::enable_if<AllInts<Args...>::value>::type>
1911     HALIDE_ALWAYS_INLINE
1912         not_void_T &
1913         operator()(int first, Args... rest) {
1914         static_assert(!T_is_void,
1915                       "Cannot use operator() on Buffer<void> types");
1916         set_host_dirty();
1917         return *((not_void_T *)(address_of(first, rest...)));
1918     }
1919 
1920     HALIDE_ALWAYS_INLINE
1921     not_void_T &
1922     operator()() {
1923         static_assert(!T_is_void,
1924                       "Cannot use operator() on Buffer<void> types");
1925         set_host_dirty();
1926         return *((not_void_T *)(data()));
1927     }
1928 
1929     HALIDE_ALWAYS_INLINE
1930     not_void_T &
1931     operator()(const int *pos) {
1932         static_assert(!T_is_void,
1933                       "Cannot use operator() on Buffer<void> types");
1934         set_host_dirty();
1935         return *((not_void_T *)(address_of(pos)));
1936     }
1937     // @}
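
    /* A minimal usage sketch (illustrative; 'im' and 'cim' are hypothetical):
     \code
     Buffer<int, 2> im(4, 4);
     im(1, 2) = 7;                     // non-const access sets host_dirty
     const Buffer<int, 2> &cim = im;
     assert(cim(1, 2) == 7);           // const access leaves the flags alone
     assert(&cim(1) == &cim(1, 0));    // omitted coordinates default to the min
     \endcode
     */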
1938 
1939     /** Tests that all values in this buffer are equal to val. */
1940     bool all_equal(not_void_T val) const {
1941         bool all_equal = true;
1942         for_each_element([&](const int *pos) { all_equal &= (*this)(pos) == val; });
1943         return all_equal;
1944     }
1945 
1946     Buffer<T, D> &fill(not_void_T val) {
1947         set_host_dirty();
1948         for_each_value([=](T &v) { v = val; });
1949         return *this;
1950     }
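
    /* A minimal usage sketch (illustrative; 'im' is hypothetical):
     \code
     Buffer<uint8_t, 2> im(32, 32);
     im.fill(255);
     assert(im.all_equal(255));
     \endcode
     */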
1951 
1952 private:
1953     /** Helper functions for for_each_value. */
1954     // @{
1955     template<int N>
1956     struct for_each_value_task_dim {
1957         int extent;
1958         int stride[N];
1959     };
1960 
1961     // Given an array of strides, and a bunch of pointers to pointers
1962     // (all of different types), advance the pointers using the
1963     // strides.
1964     template<typename Ptr, typename... Ptrs>
1965     HALIDE_ALWAYS_INLINE static void advance_ptrs(const int *stride, Ptr *ptr, Ptrs... ptrs) {
1966         (*ptr) += *stride;
1967         advance_ptrs(stride + 1, ptrs...);
1968     }
1969 
1970     HALIDE_ALWAYS_INLINE
1971     static void advance_ptrs(const int *) {
1972     }
1973 
1974     // Same as the above, but just increments the pointers.
1975     template<typename Ptr, typename... Ptrs>
1976     HALIDE_ALWAYS_INLINE static void increment_ptrs(Ptr *ptr, Ptrs... ptrs) {
1977         (*ptr)++;
1978         increment_ptrs(ptrs...);
1979     }
1980 
1981     HALIDE_ALWAYS_INLINE
1982     static void increment_ptrs() {
1983     }
1984 
1985     template<typename Fn, typename... Ptrs>
1986     HALIDE_NEVER_INLINE static void for_each_value_helper(Fn &&f, int d, bool innermost_strides_are_one,
1987                                                           const for_each_value_task_dim<sizeof...(Ptrs)> *t, Ptrs... ptrs) {
1988         if (d == -1) {
1989             f((*ptrs)...);
1990         } else if (d == 0) {
1991             if (innermost_strides_are_one) {
1992                 for (int i = t[0].extent; i != 0; i--) {
1993                     f((*ptrs)...);
1994                     increment_ptrs((&ptrs)...);
1995                 }
1996             } else {
1997                 for (int i = t[0].extent; i != 0; i--) {
1998                     f((*ptrs)...);
1999                     advance_ptrs(t[0].stride, (&ptrs)...);
2000                 }
2001             }
2002         } else {
2003             for (int i = t[d].extent; i != 0; i--) {
2004                 for_each_value_helper(f, d - 1, innermost_strides_are_one, t, ptrs...);
2005                 advance_ptrs(t[d].stride, (&ptrs)...);
2006             }
2007         }
2008     }
2009 
2010     template<int N>
2011     HALIDE_NEVER_INLINE static bool for_each_value_prep(for_each_value_task_dim<N> *t,
2012                                                         const halide_buffer_t **buffers) {
2013         // Check the buffers all have clean host allocations
2014         for (int i = 0; i < N; i++) {
2015             if (buffers[i]->device) {
2016                 assert(buffers[i]->host &&
2017                        "Buffer passed to for_each_value has device allocation but no host allocation. Call allocate() and copy_to_host() first");
2018                 assert(!buffers[i]->device_dirty() &&
2019                        "Buffer passed to for_each_value is dirty on device. Call copy_to_host() first");
2020             } else {
2021                 assert(buffers[i]->host &&
2022                        "Buffer passed to for_each_value has no host or device allocation");
2023             }
2024         }
2025 
2026         const int dimensions = buffers[0]->dimensions;
2027 
2028         // Extract the strides in all the dimensions
2029         for (int i = 0; i < dimensions; i++) {
2030             for (int j = 0; j < N; j++) {
2031                 assert(buffers[j]->dimensions == dimensions);
2032                 assert(buffers[j]->dim[i].extent == buffers[0]->dim[i].extent &&
2033                        buffers[j]->dim[i].min == buffers[0]->dim[i].min);
2034                 const int s = buffers[j]->dim[i].stride;
2035                 t[i].stride[j] = s;
2036             }
2037             t[i].extent = buffers[0]->dim[i].extent;
2038 
2039             // Order the dimensions by stride, so that the traversal is cache-coherent.
2040             for (int j = i; j > 0 && t[j].stride[0] < t[j - 1].stride[0]; j--) {
2041                 std::swap(t[j], t[j - 1]);
2042             }
2043         }
2044 
2045         // flatten dimensions where possible to make a larger inner
2046         // loop for autovectorization.
2047         int d = dimensions;
2048         for (int i = 1; i < d; i++) {
2049             bool flat = true;
2050             for (int j = 0; j < N; j++) {
2051                 flat = flat && t[i - 1].stride[j] * t[i - 1].extent == t[i].stride[j];
2052             }
2053             if (flat) {
2054                 t[i - 1].extent *= t[i].extent;
2055                 for (int j = i; j < d; j++) {
2056                     t[j] = t[j + 1];
2057                 }
2058                 i--;
2059                 d--;
2060                 t[d].extent = 1;
2061             }
2062         }
2063 
2064         bool innermost_strides_are_one = true;
2065         if (dimensions > 0) {
2066             for (int i = 0; i < N; i++) {
2067                 innermost_strides_are_one &= (t[0].stride[i] == 1);
2068             }
2069         }
2070 
2071         return innermost_strides_are_one;
2072     }
2073 
2074     template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2075     void for_each_value_impl(Fn &&f, Args &&... other_buffers) const {
2076         Buffer<>::for_each_value_task_dim<N> *t =
2077             (Buffer<>::for_each_value_task_dim<N> *)HALIDE_ALLOCA((dimensions() + 1) * sizeof(for_each_value_task_dim<N>));
2078         // Move the preparatory code into a non-templated helper to
2079         // save code size.
2080         const halide_buffer_t *buffers[] = {&buf, (&other_buffers.buf)...};
2081         bool innermost_strides_are_one = Buffer<>::for_each_value_prep(t, buffers);
2082 
2083         Buffer<>::for_each_value_helper(f, dimensions() - 1,
2084                                         innermost_strides_are_one,
2085                                         t,
2086                                         data(), (other_buffers.data())...);
2087     }
2088     // @}
2089 
2090 public:
2091     /** Call a function on every value in the buffer, and the
2092      * corresponding values in some number of other buffers of the
2093      * same size. The function should take a reference, const
2094      * reference, or value of the correct type for each buffer. This
2095      * effectively lifts a function of scalars to an element-wise
2096      * function of buffers. This produces code that the compiler can
2097      * autovectorize. This is slightly cheaper than for_each_element,
2098      * because it does not need to track the coordinates.
2099      *
2100      * Note that constness of Buffers is preserved: a const Buffer<T> (for either
2101      * 'this' or the other-buffers arguments) will allow mutation of the
2102      * buffer contents, while a Buffer<const T> will not. Attempting to specify
2103      * a mutable reference for the lambda argument of a Buffer<const T>
2104      * will result in a compilation error. */
2105     // @{
2106     template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2107     HALIDE_ALWAYS_INLINE const Buffer<T, D> &for_each_value(Fn &&f, Args &&... other_buffers) const {
2108         for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2109         return *this;
2110     }
2111 
2112     template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2113     HALIDE_ALWAYS_INLINE
2114         Buffer<T, D> &
2115         for_each_value(Fn &&f, Args &&... other_buffers) {
2116         for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2117         return *this;
2118     }
2119     // @}
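
    /* A minimal usage sketch (illustrative; 'a', 'b', and 'out' are
     * hypothetical): an element-wise 2*a + b over three equally-shaped
     * buffers:
     \code
     Buffer<float, 2> a(64, 64), b(64, 64), out(64, 64);
     a.fill(1.0f);
     b.fill(2.0f);
     out.for_each_value([](float &o, float x, float y) { o = 2.0f * x + y; },
                        a, b);
     assert(out.all_equal(4.0f));
     \endcode
     */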
2120 
2121 private:
2122     // Helper functions for for_each_element
2123     struct for_each_element_task_dim {
2124         int min, max;
2125     };
2126 
2127     /** If f is callable with this many args, call it. The first
2128      * argument is just to make the overloads distinct. Actual
2129      * overload selection is done using the enable_if. */
2130     template<typename Fn,
2131              typename... Args,
2132              typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2133     HALIDE_ALWAYS_INLINE static void for_each_element_variadic(int, int, const for_each_element_task_dim *, Fn &&f, Args... args) {
2134         f(args...);
2135     }
2136 
2137     /** If the above overload is impossible, we add an outer loop over
2138      * an additional argument and try again. */
2139     template<typename Fn,
2140              typename... Args>
2141     HALIDE_ALWAYS_INLINE static void for_each_element_variadic(double, int d, const for_each_element_task_dim *t, Fn &&f, Args... args) {
2142         for (int i = t[d].min; i <= t[d].max; i++) {
2143             for_each_element_variadic(0, d - 1, t, std::forward<Fn>(f), i, args...);
2144         }
2145     }
2146 
2147     /** Determine the minimum number of arguments a callable can take
2148      * using the same trick. */
2149     template<typename Fn,
2150              typename... Args,
2151              typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2152     HALIDE_ALWAYS_INLINE static int num_args(int, Fn &&, Args...) {
2153         return (int)(sizeof...(Args));
2154     }
2155 
2156     /** The recursive version is only enabled up to a recursion limit
2157      * of 256. This catches callables that aren't callable with any
2158      * number of ints. */
2159     template<typename Fn,
2160              typename... Args>
2161     HALIDE_ALWAYS_INLINE static int num_args(double, Fn &&f, Args... args) {
2162         static_assert(sizeof...(args) <= 256,
2163                       "Callable passed to for_each_element must accept either a const int *,"
2164                       " or up to 256 ints. No such operator found. Expect infinite template recursion.");
2165         return num_args(0, std::forward<Fn>(f), 0, args...);
2166     }
2167 
2168     /** A version where the callable takes a position array instead,
2169      * with compile-time recursion on the dimensionality.  This
2170      * overload is preferred to the one below using the same int vs
2171      * double trick as above, but is impossible once d hits -1 using
2172      * std::enable_if. */
2173     template<int d,
2174              typename Fn,
2175              typename = typename std::enable_if<(d >= 0)>::type>
2176     HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(int, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2177         for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2178             for_each_element_array_helper<d - 1>(0, t, std::forward<Fn>(f), pos);
2179         }
2180     }
2181 
2182     /** Base case for recursion above. */
2183     template<int d,
2184              typename Fn,
2185              typename = typename std::enable_if<(d < 0)>::type>
2186     HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(double, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2187         f(pos);
2188     }
2189 
2190     /** A run-time-recursive version (instead of
2191      * compile-time-recursive) that requires the callable to take a
2192      * pointer to a position array instead. Dispatches to the
2193      * compile-time-recursive version once the dimensionality gets
2194      * small. */
2195     template<typename Fn>
2196     static void for_each_element_array(int d, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2197         if (d == -1) {
2198             f(pos);
2199         } else if (d == 0) {
2200             // Once the dimensionality gets small enough, dispatch to
2201             // a compile-time-recursive version for better codegen of
2202             // the inner loops.
2203             for_each_element_array_helper<0, Fn>(0, t, std::forward<Fn>(f), pos);
2204         } else if (d == 1) {
2205             for_each_element_array_helper<1, Fn>(0, t, std::forward<Fn>(f), pos);
2206         } else if (d == 2) {
2207             for_each_element_array_helper<2, Fn>(0, t, std::forward<Fn>(f), pos);
2208         } else if (d == 3) {
2209             for_each_element_array_helper<3, Fn>(0, t, std::forward<Fn>(f), pos);
2210         } else {
2211             for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2212                 for_each_element_array(d - 1, t, std::forward<Fn>(f), pos);
2213             }
2214         }
2215     }
2216 
2217     /** We now have two overloads for for_each_element. This one
2218      * triggers if the callable takes a const int *.
2219      */
2220     template<typename Fn,
2221              typename = decltype(std::declval<Fn>()((const int *)nullptr))>
2222     static void for_each_element(int, int dims, const for_each_element_task_dim *t, Fn &&f, int check = 0) {
2223         int *pos = (int *)HALIDE_ALLOCA(dims * sizeof(int));
2224         for_each_element_array(dims - 1, t, std::forward<Fn>(f), pos);
2225     }
2226 
2227     /** This one triggers otherwise. It treats the callable as
2228      * something that takes some number of ints. */
2229     template<typename Fn>
2230     HALIDE_ALWAYS_INLINE static void for_each_element(double, int dims, const for_each_element_task_dim *t, Fn &&f) {
2231         int args = num_args(0, std::forward<Fn>(f));
2232         assert(dims >= args);
2233         for_each_element_variadic(0, args - 1, t, std::forward<Fn>(f));
2234     }
2235 
2236     template<typename Fn>
2237     void for_each_element_impl(Fn &&f) const {
2238         for_each_element_task_dim *t =
2239             (for_each_element_task_dim *)HALIDE_ALLOCA(dimensions() * sizeof(for_each_element_task_dim));
2240         for (int i = 0; i < dimensions(); i++) {
2241             t[i].min = dim(i).min();
2242             t[i].max = dim(i).max();
2243         }
2244         for_each_element(0, dimensions(), t, std::forward<Fn>(f));
2245     }
2246 
2247 public:
2248     /** Call a function at each site in a buffer. This is likely to be
2249      * much slower than using Halide code to populate a buffer, but is
2250      * convenient for tests. If the function has more arguments than the
2251      * buffer has dimensions, the remaining arguments will be zero. If it
2252      * has fewer arguments than the buffer has dimensions then the last
2253      * few dimensions of the buffer are not iterated over. For example,
2254      * the following code exploits this to set a floating point RGB image
2255      * to red:
2256 
2257      \code
2258      Buffer<float, 3> im(100, 100, 3);
2259      im.for_each_element([&](int x, int y) {
2260          im(x, y, 0) = 1.0f;
2261          im(x, y, 1) = 0.0f;
         im(x, y, 2) = 0.0f;
2263      });
2264      \endcode
2265 
     * The compiled code is equivalent to writing a nested for loop,
2267      * and compilers are capable of optimizing it in the same way.
2268      *
2269      * If the callable can be called with an int * as the sole argument,
2270      * that version is called instead. Each location in the buffer is
2271      * passed to it in a coordinate array. This version is higher-overhead
2272      * than the variadic version, but is useful for writing generic code
2273      * that accepts buffers of arbitrary dimensionality. For example, the
2274      * following sets the value at all sites in an arbitrary-dimensional
2275      * buffer to their first coordinate:
2276 
2277      \code
2278      im.for_each_element([&](const int *pos) {im(pos) = pos[0];});
2279      \endcode
2280 
     * It is also possible to use for_each_element to iterate over entire
     * rows or columns by slicing the buffer down to a single row or column
     * and iterating over the elements of the result. For example, to set
     * the diagonal of the image to 1 by iterating over the columns:
2285 
2286      \code
2287      Buffer<float, 3> im(100, 100, 3);
     im.sliced(1, 0).for_each_element([&](int x, int c) {
         im(x, x, c) = 1.0f;
2290      });
2291      \endcode
2292 
2293      * Or, assuming the memory layout is known to be dense per row, one can
2294      * memset each row of an image like so:
2295 
2296      \code
2297      Buffer<float, 3> im(100, 100, 3);
2298      im.sliced(0, 0).for_each_element([&](int y, int c) {
2299          memset(&im(0, y, c), 0, sizeof(float) * im.width());
2300      });
2301      \endcode
2302 
2303     */
2304     // @{
2305     template<typename Fn>
2306     HALIDE_ALWAYS_INLINE const Buffer<T, D> &for_each_element(Fn &&f) const {
2307         for_each_element_impl(f);
2308         return *this;
2309     }
2310 
2311     template<typename Fn>
2312     HALIDE_ALWAYS_INLINE
2313         Buffer<T, D> &
2314         for_each_element(Fn &&f) {
2315         for_each_element_impl(f);
2316         return *this;
2317     }
2318     // @}
2319 
2320 private:
2321     template<typename Fn>
2322     struct FillHelper {
2323         Fn f;
2324         Buffer<T, D> *buf;
2325 
2326         template<typename... Args,
2327                  typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2328         void operator()(Args... args) {
2329             (*buf)(args...) = f(args...);
2330         }
2331 
2332         FillHelper(Fn &&f, Buffer<T, D> *buf)
2333             : f(std::forward<Fn>(f)), buf(buf) {
2334         }
2335     };
2336 
2337 public:
2338     /** Fill a buffer by evaluating a callable at every site. The
2339      * callable should look much like a callable passed to
2340      * for_each_element, but it should return the value that should be
2341      * stored to the coordinate corresponding to the arguments. */
2342     template<typename Fn,
2343              typename = typename std::enable_if<!std::is_arithmetic<typename std::decay<Fn>::type>::value>::type>
2344     Buffer<T, D> &fill(Fn &&f) {
2345         // We'll go via for_each_element. We need a variadic wrapper lambda.
2346         FillHelper<Fn> wrapper(std::forward<Fn>(f), this);
2347         return for_each_element(wrapper);
2348     }
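
    /* A minimal usage sketch (illustrative; 'im' is hypothetical): fill a
     * buffer with a gradient of its coordinates:
     \code
     Buffer<int, 2> im(16, 16);
     im.fill([](int x, int y) { return x + 16 * y; });
     assert(im(3, 2) == 35);
     \endcode
     */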
2349 
    /** Check if an input buffer passed to an extern stage is a bounds
     * query. Compared to doing the host pointer check directly,
2352      * this both adds clarity to code and will facilitate moving to
2353      * another representation for bounds query arguments. */
2354     bool is_bounds_query() const {
2355         return buf.is_bounds_query();
2356     }
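
    /* A minimal usage sketch (illustrative; 'my_extern_stage' and its
     * bounds logic are hypothetical): the typical shape of an extern stage
     * that answers bounds queries before doing real work:
     \code
     extern "C" int my_extern_stage(halide_buffer_t *in, halide_buffer_t *out) {
         Halide::Runtime::Buffer<float> input(*in);
         if (input.is_bounds_query()) {
             // Report the input region required; here we (hypothetically)
             // need the same region as the output.
             for (int d = 0; d < out->dimensions; d++) {
                 in->dim[d] = out->dim[d];
             }
             return 0;
         }
         // ... compute out from in ...
         return 0;
     }
     \endcode
     */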
2357 
2358     /** Convenient check to verify that all of the interesting bytes in the Buffer
2359      * are initialized under MSAN. Note that by default, we use for_each_value() here so that
2360      * we skip any unused padding that isn't part of the Buffer; this isn't efficient,
2361      * but in MSAN mode, it doesn't matter. (Pass true for the flag to force check
2362      * the entire Buffer storage.) */
2363     void msan_check_mem_is_initialized(bool entire = false) const {
2364 #if defined(__has_feature)
2365 #if __has_feature(memory_sanitizer)
2366         if (entire) {
2367             __msan_check_mem_is_initialized(data(), size_in_bytes());
2368         } else {
            for_each_value([](T &v) { __msan_check_mem_is_initialized(&v, sizeof(T)); });
2370         }
2371 #endif
2372 #endif
2373     }
2374 };
2375 
2376 }  // namespace Runtime
2377 }  // namespace Halide
2378 
2379 #undef HALIDE_ALLOCA
2380 
#endif  // HALIDE_RUNTIME_BUFFER_H
2382