/** \file
 * Defines a Buffer type that wraps halide_buffer_t and adds
 * functionality, and methods for more conveniently iterating over the
 * samples in a halide_buffer_t outside of Halide code. */

#ifndef HALIDE_RUNTIME_BUFFER_H
#define HALIDE_RUNTIME_BUFFER_H

#include <algorithm>
#include <atomic>
#include <cassert>
#include <limits>
#include <memory>
#include <stdint.h>
#include <string.h>
#include <vector>

#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#endif
#endif

#include "HalideRuntime.h"

#ifdef _MSC_VER
#include <malloc.h>
#define HALIDE_ALLOCA _alloca
#else
#define HALIDE_ALLOCA __builtin_alloca
#endif

// gcc 5.1 has a false positive warning on this code
#if __GNUC__ == 5 && __GNUC_MINOR__ == 1
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif

namespace Halide {
namespace Runtime {

// Forward-declare our Buffer class
template<typename T, int D>
class Buffer;

// A helper to check if a parameter pack is entirely implicitly
// int-convertible to use with std::enable_if
template<typename... Args>
struct AllInts : std::false_type {};

template<>
struct AllInts<> : std::true_type {};

template<typename T, typename... Args>
struct AllInts<T, Args...> {
    static const bool value = std::is_convertible<T, int>::value && AllInts<Args...>::value;
};

// Floats and doubles are technically implicitly int-convertible, but
// doing so produces a warning we treat as an error, so just disallow
// it here.
template<typename... Args>
struct AllInts<float, Args...> : std::false_type {};

template<typename... Args>
struct AllInts<double, Args...> : std::false_type {};

// A helper to detect if there are any zeros in a container
namespace Internal {
template<typename Container>
bool any_zero(const Container &c) {
    for (int i : c) {
        if (i == 0) return true;
    }
    return false;
}
}  // namespace Internal

/** A struct acting as a header for allocations owned by the Buffer
 * class itself. */
struct AllocationHeader {
    void (*deallocate_fn)(void *);
    std::atomic<int> ref_count;

    // Note that ref_count always starts at 1
    AllocationHeader(void (*deallocate_fn)(void *))
        : deallocate_fn(deallocate_fn), ref_count(1) {
    }
};

/** This indicates how to deallocate the device for a Halide::Runtime::Buffer. */
enum struct BufferDeviceOwnership : int {
    Allocated,               ///< halide_device_free will be called when device ref count goes to zero
    WrappedNative,           ///< halide_device_detach_native will be called when device ref count goes to zero
    Unmanaged,               ///< No free routine will be called when device ref count goes to zero
    AllocatedDeviceAndHost,  ///< Call device_and_host_free when DevRefCount goes to zero.
    Cropped,                 ///< Call halide_device_release_crop when DevRefCount goes to zero.
};

/** A similar struct for managing device allocations. */
struct DeviceRefCount {
    // This is only ever constructed when there's something to manage,
    // so start at one.
    std::atomic<int> count{1};
    BufferDeviceOwnership ownership{BufferDeviceOwnership::Allocated};
};
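// An illustrative sketch of typical usage of the Buffer class defined
// below, for orientation only. It assumes the element-access
// operator() defined later in this class, and "my_pipeline" stands in
// for a hypothetical AOT-compiled Halide function:
//
//     Buffer<float> input(640, 480);   // allocates and owns 640x480 floats
//     input(10, 20) = 1.0f;            // access is (x, y[, c, ...])
//     Buffer<float> output(640, 480);
//     my_pipeline(input, output);      // converts implicitly to halide_buffer_t *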
/** A templated Buffer class that wraps halide_buffer_t and adds
 * functionality. When using Halide from C++, this is the preferred
 * way to create input and output buffers. The overhead of using this
 * class relative to a naked halide_buffer_t is minimal - it uses another
 * ~16 bytes on the stack, and does no dynamic allocations when using
 * it to represent existing memory of a known maximum dimensionality.
 *
 * The template parameter T is the element type. For buffers where the
 * element type is unknown, or may vary, use void or const void.
 *
 * D is the maximum number of dimensions that can be represented using
 * space inside the class itself. Set it to the maximum dimensionality
 * you expect this buffer to have. If the actual dimensionality exceeds
 * this, heap storage is allocated to track the shape of the buffer. D
 * defaults to 4, which should cover nearly all usage.
 *
 * The class optionally allocates and owns memory for the image using
 * a shared pointer allocated with the provided allocator. If they are
 * null, malloc and free are used. Any device-side allocation is
 * considered as owned if and only if the host-side allocation is
 * owned. */
template<typename T = void, int D = 4>
class Buffer {
    /** The underlying halide_buffer_t */
    halide_buffer_t buf = {0};

    /** Some in-class storage for shape of the dimensions. */
    halide_dimension_t shape[D];

    /** The allocation owned by this Buffer. NULL if the Buffer does not
     * own the memory. */
    AllocationHeader *alloc = nullptr;

    /** A reference count for the device allocation owned by this
     * buffer. */
    mutable DeviceRefCount *dev_ref_count = nullptr;

    /** True if T is of type void or const void */
    static const bool T_is_void = std::is_same<typename std::remove_const<T>::type, void>::value;

    /** A type function that adds a const qualifier if T is a const type. */
    template<typename T2>
    using add_const_if_T_is_const = typename std::conditional<std::is_const<T>::value, const T2, T2>::type;

    /** T unless T is (const) void, in which case (const)
     * uint8_t. Useful for providing return types for operator() */
    using not_void_T = typename std::conditional<T_is_void,
                                                 add_const_if_T_is_const<uint8_t>,
                                                 T>::type;

    /** T with constness removed. Useful for return type of copy(). */
    using not_const_T = typename std::remove_const<T>::type;

    /** The type the elements are stored as. Equal to not_void_T
     * unless T is a pointer, in which case uint64_t. Halide stores
     * all pointer types as uint64s internally, even on 32-bit
     * systems. */
    using storage_T = typename std::conditional<std::is_pointer<T>::value, uint64_t, not_void_T>::type;

public:
    /** True if the Halide type is not void (or const void). */
    static constexpr bool has_static_halide_type = !T_is_void;

    /** Get the Halide type of T. Callers should not use the result if
     * has_static_halide_type is false. */
    static halide_type_t static_halide_type() {
        return halide_type_of<typename std::remove_cv<not_void_T>::type>();
    }

    /** Does this Buffer own the host memory it refers to? */
    bool owns_host_memory() const {
        return alloc != nullptr;
    }
private:
    /** Increment the reference count of any owned allocation */
    void incref() const {
        if (owns_host_memory()) {
            alloc->ref_count++;
        }
        if (buf.device) {
            if (!dev_ref_count) {
                // I seem to have a non-zero dev field but no
                // reference count for it. I must have been given a
                // device allocation by a Halide pipeline, and have
                // never been copied from since. Take sole ownership
                // of it.
                dev_ref_count = new DeviceRefCount;
            }
            dev_ref_count->count++;
        }
    }

    // Note that this is called "cropped" but can also encompass a slice/embed
    // operation as well.
    struct DevRefCountCropped : DeviceRefCount {
        Buffer<T, D> cropped_from;
        DevRefCountCropped(const Buffer<T, D> &cropped_from)
            : cropped_from(cropped_from) {
            ownership = BufferDeviceOwnership::Cropped;
        }
    };

    /** Setup the device ref count for a buffer to indicate it is a crop (or slice, embed, etc) of cropped_from */
    void crop_from(const Buffer<T, D> &cropped_from) {
        assert(dev_ref_count == nullptr);
        dev_ref_count = new DevRefCountCropped(cropped_from);
    }

    /** Decrement the reference count of any owned allocation and free host
     * and device memory if it hits zero. Sets alloc to nullptr. */
    void decref() {
        if (owns_host_memory()) {
            int new_count = --(alloc->ref_count);
            if (new_count == 0) {
                void (*fn)(void *) = alloc->deallocate_fn;
                alloc->~AllocationHeader();
                fn(alloc);
            }
            buf.host = nullptr;
            alloc = nullptr;
            set_host_dirty(false);
        }
        decref_dev();
    }

    void decref_dev() {
        int new_count = 0;
        if (dev_ref_count) {
            new_count = --(dev_ref_count->count);
        }
        if (new_count == 0) {
            if (buf.device) {
                assert(!(alloc && device_dirty()) &&
                       "Implicitly freeing a dirty device allocation while a host allocation still lives. "
                       "Call device_free explicitly if you want to drop dirty device-side data. "
                       "Call copy_to_host explicitly if you want the data copied to the host allocation "
                       "before the device allocation is freed.");
                if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative) {
                    buf.device_interface->detach_native(nullptr, &buf);
                } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost) {
                    buf.device_interface->device_and_host_free(nullptr, &buf);
                } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
                    buf.device_interface->device_release_crop(nullptr, &buf);
                } else if (dev_ref_count == nullptr || dev_ref_count->ownership == BufferDeviceOwnership::Allocated) {
                    buf.device_interface->device_free(nullptr, &buf);
                }
            }
            if (dev_ref_count) {
                if (dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
                    delete (DevRefCountCropped *)dev_ref_count;
                } else {
                    delete dev_ref_count;
                }
            }
        }
        buf.device = 0;
        buf.device_interface = nullptr;
        dev_ref_count = nullptr;
    }

    void free_shape_storage() {
        if (buf.dim != shape) {
            delete[] buf.dim;
            buf.dim = nullptr;
        }
    }

    void make_shape_storage(const int dimensions) {
        // This should usually be inlined, so if dimensions is statically known,
        // we can skip the call to new
        buf.dimensions = dimensions;
        buf.dim = (dimensions <= D) ? shape : new halide_dimension_t[dimensions];
    }
    void copy_shape_from(const halide_buffer_t &other) {
        // All callers of this ensure that buf.dimensions == other.dimensions.
        make_shape_storage(other.dimensions);
        std::copy(other.dim, other.dim + other.dimensions, buf.dim);
    }

    template<typename T2, int D2>
    void move_shape_from(Buffer<T2, D2> &&other) {
        if (other.shape == other.buf.dim) {
            copy_shape_from(other.buf);
        } else {
            buf.dim = other.buf.dim;
            other.buf.dim = nullptr;
        }
    }

    /** Initialize the shape from a halide_buffer_t. */
    void initialize_from_buffer(const halide_buffer_t &b,
                                BufferDeviceOwnership ownership) {
        memcpy(&buf, &b, sizeof(halide_buffer_t));
        copy_shape_from(b);
        if (b.device) {
            dev_ref_count = new DeviceRefCount;
            dev_ref_count->ownership = ownership;
        }
    }

    /** Initialize the shape from an array of ints */
    void initialize_shape(const int *sizes) {
        for (int i = 0; i < buf.dimensions; i++) {
            buf.dim[i].min = 0;
            buf.dim[i].extent = sizes[i];
            if (i == 0) {
                buf.dim[i].stride = 1;
            } else {
                buf.dim[i].stride = buf.dim[i - 1].stride * buf.dim[i - 1].extent;
            }
        }
    }

    /** Initialize the shape from a vector of extents */
    void initialize_shape(const std::vector<int> &sizes) {
        assert(buf.dimensions == (int)sizes.size());
        initialize_shape(sizes.data());
    }
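    // A worked example of the dense layout initialize_shape() produces
    // (derived from the code above, not from the original docs): for
    // extents {640, 480, 3} the strides come out as {1, 640, 640 * 480},
    // i.e. the innermost dimension is densely packed - the planar
    // layout Halide assumes by default.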
    /** Initialize the shape from the static shape of an array */
    template<typename Array, size_t N>
    void initialize_shape_from_array_shape(int next, Array (&vals)[N]) {
        buf.dim[next].min = 0;
        buf.dim[next].extent = (int)N;
        if (next == 0) {
            buf.dim[next].stride = 1;
        } else {
            initialize_shape_from_array_shape(next - 1, vals[0]);
            buf.dim[next].stride = buf.dim[next - 1].stride * buf.dim[next - 1].extent;
        }
    }

    /** Base case for the template recursion above. */
    template<typename T2>
    void initialize_shape_from_array_shape(int, const T2 &) {
    }

    /** Get the dimensionality of a multi-dimensional C array */
    template<typename Array, size_t N>
    static int dimensionality_of_array(Array (&vals)[N]) {
        return dimensionality_of_array(vals[0]) + 1;
    }

    template<typename T2>
    static int dimensionality_of_array(const T2 &) {
        return 0;
    }

    /** Get the underlying halide_type_t of an array's element type. */
    template<typename Array, size_t N>
    static halide_type_t scalar_type_of_array(Array (&vals)[N]) {
        return scalar_type_of_array(vals[0]);
    }

    template<typename T2>
    static halide_type_t scalar_type_of_array(const T2 &) {
        return halide_type_of<typename std::remove_cv<T2>::type>();
    }

    /** Crop a single dimension without handling device allocation. */
    void crop_host(int d, int min, int extent) {
        assert(dim(d).min() <= min);
        assert(dim(d).max() >= min + extent - 1);
        int shift = min - dim(d).min();
        if (buf.host != nullptr) {
            buf.host += shift * dim(d).stride() * type().bytes();
        }
        buf.dim[d].min = min;
        buf.dim[d].extent = extent;
    }

    /** Crop as many dimensions as are in rect, without handling device allocation. */
    void crop_host(const std::vector<std::pair<int, int>> &rect) {
        assert(rect.size() <= static_cast<decltype(rect.size())>(std::numeric_limits<int>::max()));
        int limit = (int)rect.size();
        assert(limit <= dimensions());
        for (int i = 0; i < limit; i++) {
            crop_host(i, rect[i].first, rect[i].second);
        }
    }

    void complete_device_crop(Buffer<T, D> &result_host_cropped) const {
        assert(buf.device_interface != nullptr);
        if (buf.device_interface->device_crop(nullptr, &this->buf, &result_host_cropped.buf) == 0) {
            const Buffer<T, D> *cropped_from = this;
            // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
            // Is it possible to get to this point without incref having run at least once since
            // the device field was set? (I.e. in the internal logic of crop. incref might have been
            // called.)
            if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
                cropped_from = &((DevRefCountCropped *)dev_ref_count)->cropped_from;
            }
            result_host_cropped.crop_from(*cropped_from);
        }
    }

    /** Slice a single dimension without handling device allocation. */
    void slice_host(int d, int pos) {
        assert(d >= 0 && d < dimensions());
        assert(pos >= dim(d).min() && pos <= dim(d).max());
        buf.dimensions--;
        int shift = pos - buf.dim[d].min;
        if (buf.host != nullptr) {
            buf.host += shift * buf.dim[d].stride * type().bytes();
        }
        for (int i = d; i < buf.dimensions; i++) {
            buf.dim[i] = buf.dim[i + 1];
        }
        buf.dim[buf.dimensions] = {0, 0, 0};
    }

    void complete_device_slice(Buffer<T, D> &result_host_sliced, int d, int pos) const {
        assert(buf.device_interface != nullptr);
        if (buf.device_interface->device_slice(nullptr, &this->buf, d, pos, &result_host_sliced.buf) == 0) {
            const Buffer<T, D> *sliced_from = this;
            // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
            // Is it possible to get to this point without incref having run at least once since
            // the device field was set? (I.e. in the internal logic of slice. incref might have been
            // called.)
            if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
                sliced_from = &((DevRefCountCropped *)dev_ref_count)->cropped_from;
            }
            // crop_from() is correct here, despite the fact that we are slicing.
            result_host_sliced.crop_from(*sliced_from);
        }
    }

public:
    typedef T ElemType;

    /** Read-only access to the shape */
    class Dimension {
        const halide_dimension_t &d;

    public:
        /** The lowest coordinate in this dimension */
        HALIDE_ALWAYS_INLINE int min() const {
            return d.min;
        }

        /** The number of elements in memory you have to step over to
         * increment this coordinate by one. */
        HALIDE_ALWAYS_INLINE int stride() const {
            return d.stride;
        }

        /** The extent of the image along this dimension */
        HALIDE_ALWAYS_INLINE int extent() const {
            return d.extent;
        }

        /** The highest coordinate in this dimension */
        HALIDE_ALWAYS_INLINE int max() const {
            return min() + extent() - 1;
        }
        /** An iterator class, so that you can iterate over
         * coordinates in a dimension using a range-based for loop. */
        struct iterator {
            int val;
            int operator*() const {
                return val;
            }
            bool operator!=(const iterator &other) const {
                return val != other.val;
            }
            iterator &operator++() {
                val++;
                return *this;
            }
        };

        /** An iterator that points to the min coordinate */
        HALIDE_ALWAYS_INLINE iterator begin() const {
            return {min()};
        }

        /** An iterator that points to one past the max coordinate */
        HALIDE_ALWAYS_INLINE iterator end() const {
            return {min() + extent()};
        }

        Dimension(const halide_dimension_t &dim)
            : d(dim) {
        }
    };

    /** Access the shape of the buffer */
    HALIDE_ALWAYS_INLINE Dimension dim(int i) const {
        assert(i >= 0 && i < this->dimensions());
        return Dimension(buf.dim[i]);
    }
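    // A hedged sketch of how dim() combines with the iterator above
    // (not part of the original docs): each Dimension is a range, so a
    // whole image can be visited with nested range-based for loops.
    // Assumes the element-access operator() defined later in this
    // class, for some Buffer<float> im:
    //
    //     for (int y : im.dim(1)) {
    //         for (int x : im.dim(0)) {
    //             im(x, y) = 0.0f;
    //         }
    //     }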
    /** Access to the mins, strides, extents. Will be deprecated. Do not use. */
    // @{
    int min(int i) const {
        return dim(i).min();
    }
    int extent(int i) const {
        return dim(i).extent();
    }
    int stride(int i) const {
        return dim(i).stride();
    }
    // @}

    /** The total number of elements this buffer represents. Equal to
     * the product of the extents */
    size_t number_of_elements() const {
        size_t s = 1;
        for (int i = 0; i < dimensions(); i++) {
            s *= dim(i).extent();
        }
        return s;
    }

    /** Get the dimensionality of the buffer. */
    int dimensions() const {
        return buf.dimensions;
    }

    /** Get the type of the elements. */
    halide_type_t type() const {
        return buf.type;
    }

private:
    /** Offset to the element with the lowest address. If all
     * strides are positive, equal to zero. Offset is in elements, not bytes. */
    ptrdiff_t begin_offset() const {
        ptrdiff_t index = 0;
        for (int i = 0; i < dimensions(); i++) {
            if (dim(i).stride() < 0) {
                index += dim(i).stride() * (dim(i).extent() - 1);
            }
        }
        return index;
    }

    /** An offset to one beyond the element with the highest address.
     * Offset is in elements, not bytes. */
    ptrdiff_t end_offset() const {
        ptrdiff_t index = 0;
        for (int i = 0; i < dimensions(); i++) {
            if (dim(i).stride() > 0) {
                index += dim(i).stride() * (dim(i).extent() - 1);
            }
        }
        index += 1;
        return index;
    }

public:
    /** A pointer to the element with the lowest address. If all
     * strides are positive, equal to the host pointer. */
    T *begin() const {
        assert(buf.host != nullptr);  // Cannot call begin() on an unallocated Buffer.
        return (T *)(buf.host + begin_offset() * type().bytes());
    }

    /** A pointer to one beyond the element with the highest address. */
    T *end() const {
        assert(buf.host != nullptr);  // Cannot call end() on an unallocated Buffer.
        return (T *)(buf.host + end_offset() * type().bytes());
    }

    /** The total number of bytes spanned by the data in memory. */
    size_t size_in_bytes() const {
        return (size_t)(end_offset() - begin_offset()) * type().bytes();
    }

    /** Reset the Buffer to be equivalent to a default-constructed Buffer
     * of the same static type (if any); Buffer<void> will have its runtime
     * type reset to uint8. */
    void reset() {
        *this = Buffer();
    }

    Buffer()
        : shape() {
        buf.type = static_halide_type();
        make_shape_storage(0);
    }

    /** Make a Buffer from a halide_buffer_t */
    explicit Buffer(const halide_buffer_t &buf,
                    BufferDeviceOwnership ownership = BufferDeviceOwnership::Unmanaged) {
        assert(T_is_void || buf.type == static_halide_type());
        initialize_from_buffer(buf, ownership);
    }

    /** Give Buffers access to the members of Buffers of different dimensionalities and types. */
    template<typename T2, int D2>
    friend class Buffer;

private:
    template<typename T2, int D2>
    static void static_assert_can_convert_from() {
        static_assert((!std::is_const<T2>::value || std::is_const<T>::value),
                      "Can't convert from a Buffer<const T> to a Buffer<T>");
        static_assert(std::is_same<typename std::remove_const<T>::type,
                                   typename std::remove_const<T2>::type>::value ||
                          T_is_void || Buffer<T2, D2>::T_is_void,
                      "type mismatch constructing Buffer");
    }

public:
    /** Determine if a Buffer<T, D> can be constructed from some other Buffer type.
     * If this can be determined at compile time, fail with a static assert; otherwise
     * return a boolean based on runtime typing. */
    template<typename T2, int D2>
    static bool can_convert_from(const Buffer<T2, D2> &other) {
        static_assert_can_convert_from<T2, D2>();
        if (Buffer<T2, D2>::T_is_void && !T_is_void) {
            return other.type() == static_halide_type();
        }
        return true;
    }

    /** Fail an assertion at runtime or compile-time if a Buffer<T, D>
     * cannot be constructed from some other Buffer type. */
    template<typename T2, int D2>
    static void assert_can_convert_from(const Buffer<T2, D2> &other) {
        // Explicitly call static_assert_can_convert_from() here so
        // that we always get compile-time checking, even if compiling with
        // assertions disabled.
        static_assert_can_convert_from<T2, D2>();
        assert(can_convert_from(other));
    }

    /** Copy constructor. Does not copy underlying data. */
    Buffer(const Buffer<T, D> &other)
        : buf(other.buf),
          alloc(other.alloc) {
        other.incref();
        dev_ref_count = other.dev_ref_count;
        copy_shape_from(other.buf);
    }

    /** Construct a Buffer from a Buffer of different dimensionality
     * and type. Asserts that the type matches (at runtime, if one of
     * the types is void). Note that this constructor is
     * implicit. This, for example, lets you pass things like
     * Buffer<T> or Buffer<const void> to functions expecting
     * Buffer<const T>. */
    template<typename T2, int D2>
    Buffer(const Buffer<T2, D2> &other)
        : buf(other.buf),
          alloc(other.alloc) {
        assert_can_convert_from(other);
        other.incref();
        dev_ref_count = other.dev_ref_count;
        copy_shape_from(other.buf);
    }
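    // A hedged example of the implicit conversion the constructor
    // above enables (illustrative only; "blur" is a hypothetical
    // function, not part of this header):
    //
    //     void blur(const Buffer<const float> &in);
    //
    //     Buffer<float> im(640, 480);
    //     blur(im);  // Buffer<float> converts implicitly to Buffer<const float>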
    /** Move constructor */
    Buffer(Buffer<T, D> &&other) noexcept
        : buf(other.buf),
          alloc(other.alloc),
          dev_ref_count(other.dev_ref_count) {
        other.dev_ref_count = nullptr;
        other.alloc = nullptr;
        move_shape_from(std::forward<Buffer<T, D>>(other));
        other.buf = halide_buffer_t();
    }

    /** Move-construct a Buffer from a Buffer of different
     * dimensionality and type. Asserts that the types match (at
     * runtime if one of the types is void). */
    template<typename T2, int D2>
    Buffer(Buffer<T2, D2> &&other)
        : buf(other.buf),
          alloc(other.alloc),
          dev_ref_count(other.dev_ref_count) {
        assert_can_convert_from(other);
        other.dev_ref_count = nullptr;
        other.alloc = nullptr;
        move_shape_from(std::forward<Buffer<T2, D2>>(other));
        other.buf = halide_buffer_t();
    }

    /** Assign from another Buffer of possibly-different
     * dimensionality and type. Asserts that the types match (at
     * runtime if one of the types is void). */
    template<typename T2, int D2>
    Buffer<T, D> &operator=(const Buffer<T2, D2> &other) {
        if ((const void *)this == (const void *)&other) {
            return *this;
        }
        assert_can_convert_from(other);
        other.incref();
        decref();
        dev_ref_count = other.dev_ref_count;
        alloc = other.alloc;
        free_shape_storage();
        buf = other.buf;
        copy_shape_from(other.buf);
        return *this;
    }

    /** Standard assignment operator */
    Buffer<T, D> &operator=(const Buffer<T, D> &other) {
        if (this == &other) {
            return *this;
        }
        other.incref();
        decref();
        dev_ref_count = other.dev_ref_count;
        alloc = other.alloc;
        free_shape_storage();
        buf = other.buf;
        copy_shape_from(other.buf);
        return *this;
    }

    /** Move from another Buffer of possibly-different
     * dimensionality and type. Asserts that the types match (at
     * runtime if one of the types is void). */
    template<typename T2, int D2>
    Buffer<T, D> &operator=(Buffer<T2, D2> &&other) {
        assert_can_convert_from(other);
        decref();
        alloc = other.alloc;
        other.alloc = nullptr;
        dev_ref_count = other.dev_ref_count;
        other.dev_ref_count = nullptr;
        free_shape_storage();
        buf = other.buf;
        move_shape_from(std::forward<Buffer<T2, D2>>(other));
        other.buf = halide_buffer_t();
        return *this;
    }

    /** Standard move-assignment operator */
    Buffer<T, D> &operator=(Buffer<T, D> &&other) noexcept {
        decref();
        alloc = other.alloc;
        other.alloc = nullptr;
        dev_ref_count = other.dev_ref_count;
        other.dev_ref_count = nullptr;
        free_shape_storage();
        buf = other.buf;
        move_shape_from(std::forward<Buffer<T, D>>(other));
        other.buf = halide_buffer_t();
        return *this;
    }

    /** Check that the product of the extents fits in memory. */
    void check_overflow() {
        size_t size = type().bytes();
        for (int i = 0; i < dimensions(); i++) {
            size *= dim(i).extent();
        }
        // We allow 2^31 or 2^63 bytes, so drop the top bit.
        size = (size << 1) >> 1;
        for (int i = 0; i < dimensions(); i++) {
            size /= dim(i).extent();
        }
        assert(size == (size_t)type().bytes() && "Error: Overflow computing total size of buffer.");
    }
    /** Allocate memory for this Buffer. Drops the reference to any
     * owned memory. */
    void allocate(void *(*allocate_fn)(size_t) = nullptr,
                  void (*deallocate_fn)(void *) = nullptr) {
        if (!allocate_fn) {
            allocate_fn = malloc;
        }
        if (!deallocate_fn) {
            deallocate_fn = free;
        }

        // Drop any existing allocation
        deallocate();

        // Conservatively align images to 128 bytes. This is enough
        // alignment for all the platforms we might use.
        size_t size = size_in_bytes();
        const size_t alignment = 128;
        size = (size + alignment - 1) & ~(alignment - 1);
        void *alloc_storage = allocate_fn(size + sizeof(AllocationHeader) + alignment - 1);
        alloc = new (alloc_storage) AllocationHeader(deallocate_fn);
        uint8_t *unaligned_ptr = ((uint8_t *)alloc) + sizeof(AllocationHeader);
        buf.host = (uint8_t *)((uintptr_t)(unaligned_ptr + alignment - 1) & ~(alignment - 1));
    }

    /** Drop reference to any owned host or device memory, possibly
     * freeing it, if this buffer held the last reference to
     * it. Retains the shape of the buffer. Does nothing if this
     * buffer did not allocate its own memory. */
    void deallocate() {
        decref();
    }

    /** Drop reference to any owned device memory, possibly freeing it
     * if this buffer held the last reference to it. Asserts that
     * device_dirty is false. */
    void device_deallocate() {
        decref_dev();
    }

    /** Allocate a new image of the given size with a runtime
     * type. Only used when you do know what size you want but you
     * don't know statically what type the elements are. Pass zeroes
     * to make a buffer suitable for bounds query calls. */
    template<typename... Args,
             typename = typename std::enable_if<AllInts<Args...>::value>::type>
    Buffer(halide_type_t t, int first, Args... rest) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        int extents[] = {first, (int)rest...};
        buf.type = t;
        constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
        make_shape_storage(buf_dimensions);
        initialize_shape(extents);
        if (!Internal::any_zero(extents)) {
            check_overflow();
            allocate();
        }
    }

    /** Allocate a new image of the given size. Pass zeroes to make a
     * buffer suitable for bounds query calls. */
    // @{

    // The overload with one argument is 'explicit', so that
    // (say) int is not implicitly convertible to Buffer<int>
    explicit Buffer(int first) {
        static_assert(!T_is_void,
                      "To construct a Buffer<void>, pass a halide_type_t as the first argument to the constructor");
        int extents[] = {first};
        buf.type = static_halide_type();
        constexpr int buf_dimensions = 1;
        make_shape_storage(buf_dimensions);
        initialize_shape(extents);
        if (first != 0) {
            check_overflow();
            allocate();
        }
    }

    template<typename... Args,
             typename = typename std::enable_if<AllInts<Args...>::value>::type>
    Buffer(int first, int second, Args... rest) {
        static_assert(!T_is_void,
                      "To construct a Buffer<void>, pass a halide_type_t as the first argument to the constructor");
        int extents[] = {first, second, (int)rest...};
        buf.type = static_halide_type();
        constexpr int buf_dimensions = 2 + (int)(sizeof...(rest));
        make_shape_storage(buf_dimensions);
        initialize_shape(extents);
        if (!Internal::any_zero(extents)) {
            check_overflow();
            allocate();
        }
    }
    // @}
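    // A hedged pair of examples contrasting the typed and
    // runtime-typed allocating constructors above (illustrative only):
    //
    //     Buffer<float> a(640, 480);                          // element type fixed at compile time
    //     Buffer<void> b(halide_type_of<float>(), 640, 480);  // element type chosen at runtime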
    /** Allocate a new image of unknown type using a vector of ints as the size. */
    Buffer(halide_type_t t, const std::vector<int> &sizes) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        buf.type = t;
        make_shape_storage((int)sizes.size());
        initialize_shape(sizes);
        if (!Internal::any_zero(sizes)) {
            check_overflow();
            allocate();
        }
    }

    /** Allocate a new image of known type using a vector of ints as the size. */
    explicit Buffer(const std::vector<int> &sizes)
        : Buffer(static_halide_type(), sizes) {
    }

private:
    // Create a copy of the sizes vector, ordered as specified by order.
    static std::vector<int> make_ordered_sizes(const std::vector<int> &sizes, const std::vector<int> &order) {
        assert(order.size() == sizes.size());
        std::vector<int> ordered_sizes(sizes.size());
        for (size_t i = 0; i < sizes.size(); ++i) {
            ordered_sizes[i] = sizes.at(order[i]);
        }
        return ordered_sizes;
    }

public:
    /** Allocate a new image of unknown type using a vector of ints as the size and
     * a vector of indices indicating the storage order for each dimension. The
     * length of the sizes vector and the storage-order vector must match. For instance,
     * to allocate an interleaved RGB buffer, you would pass {2, 0, 1} for storage_order. */
    Buffer(halide_type_t t, const std::vector<int> &sizes, const std::vector<int> &storage_order)
        : Buffer(t, make_ordered_sizes(sizes, storage_order)) {
        transpose(storage_order);
    }

    Buffer(const std::vector<int> &sizes, const std::vector<int> &storage_order)
        : Buffer(static_halide_type(), sizes, storage_order) {
    }
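    // A hedged sketch of the storage_order example from the comment
    // above (the resulting strides are derived from the code, not from
    // the original docs):
    //
    //     Buffer<uint8_t> rgb({640, 480, 3}, {2, 0, 1});
    //
    // rgb is still indexed as (x, y, c), but c has stride 1, x has
    // stride 3, and y has stride 640 * 3: an interleaved layout.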
    /** Make a Buffer that refers to a statically sized array. Does not
     * take ownership of the data, and does not set the host_dirty flag. */
    template<typename Array, size_t N>
    explicit Buffer(Array (&vals)[N]) {
        const int buf_dimensions = dimensionality_of_array(vals);
        buf.type = scalar_type_of_array(vals);
        buf.host = (uint8_t *)vals;
        make_shape_storage(buf_dimensions);
        initialize_shape_from_array_shape(buf.dimensions - 1, vals);
    }

    /** Initialize a Buffer of runtime type from a pointer and some
     * sizes. Assumes dense row-major packing and a min coordinate of
     * zero. Does not take ownership of the data and does not set the
     * host_dirty flag. */
    template<typename... Args,
             typename = typename std::enable_if<AllInts<Args...>::value>::type>
    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int first, Args &&... rest) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        int extents[] = {first, (int)rest...};
        buf.type = t;
        constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
        buf.host = (uint8_t *)const_cast<void *>(data);
        make_shape_storage(buf_dimensions);
        initialize_shape(extents);
    }

    /** Initialize a Buffer from a pointer and some sizes. Assumes
     * dense row-major packing and a min coordinate of zero. Does not
     * take ownership of the data and does not set the host_dirty flag. */
    template<typename... Args,
             typename = typename std::enable_if<AllInts<Args...>::value>::type>
    explicit Buffer(T *data, int first, Args &&... rest) {
        int extents[] = {first, (int)rest...};
        buf.type = static_halide_type();
        constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
        buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
        make_shape_storage(buf_dimensions);
        initialize_shape(extents);
    }

    /** Initialize a Buffer from a pointer and a vector of
     * sizes. Assumes dense row-major packing and a min coordinate of
     * zero. Does not take ownership of the data and does not set the
     * host_dirty flag. */
    explicit Buffer(T *data, const std::vector<int> &sizes) {
        buf.type = static_halide_type();
        buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
        make_shape_storage((int)sizes.size());
        initialize_shape(sizes);
    }

    /** Initialize a Buffer of runtime type from a pointer and a
     * vector of sizes. Assumes dense row-major packing and a min
     * coordinate of zero. Does not take ownership of the data and
     * does not set the host_dirty flag. */
    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, const std::vector<int> &sizes) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        buf.type = t;
        buf.host = (uint8_t *)const_cast<void *>(data);
        make_shape_storage((int)sizes.size());
        initialize_shape(sizes);
    }

    /** Initialize a Buffer from a pointer to the min coordinate and
     * an array describing the shape. Does not take ownership of the
     * data, and does not set the host_dirty flag. */
    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int d, const halide_dimension_t *shape) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        buf.type = t;
        buf.host = (uint8_t *)const_cast<void *>(data);
        make_shape_storage(d);
        for (int i = 0; i < d; i++) {
            buf.dim[i] = shape[i];
        }
    }

    /** Initialize a Buffer from a pointer to the min coordinate and
     * a vector describing the shape. Does not take ownership of the
     * data, and does not set the host_dirty flag. */
    explicit inline Buffer(halide_type_t t, add_const_if_T_is_const<void> *data,
                           const std::vector<halide_dimension_t> &shape)
        : Buffer(t, data, (int)shape.size(), shape.data()) {
    }

    /** Initialize a Buffer from a pointer to the min coordinate and
     * an array describing the shape. Does not take ownership of the
     * data and does not set the host_dirty flag. */
    explicit Buffer(T *data, int d, const halide_dimension_t *shape) {
        buf.type = static_halide_type();
        buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
        make_shape_storage(d);
        for (int i = 0; i < d; i++) {
            buf.dim[i] = shape[i];
        }
    }

    /** Initialize a Buffer from a pointer to the min coordinate and
     * a vector describing the shape. Does not take ownership of the
     * data, and does not set the host_dirty flag. */
    explicit inline Buffer(T *data, const std::vector<halide_dimension_t> &shape)
        : Buffer(data, (int)shape.size(), shape.data()) {
    }

    /** Destructor. Will release any underlying owned allocation if
     * this is the last reference to it. Will assert fail if there are
     * weak references to this Buffer outstanding. */
    ~Buffer() {
        free_shape_storage();
        decref();
    }

    /** Get a pointer to the raw halide_buffer_t this wraps. */
    // @{
    halide_buffer_t *raw_buffer() {
        return &buf;
    }

    const halide_buffer_t *raw_buffer() const {
        return &buf;
    }
    // @}

    /** Provide a cast operator to halide_buffer_t *, so that
     * instances can be passed directly to Halide filters. */
    operator halide_buffer_t *() {
        return &buf;
    }
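    // A hedged sketch of wrapping existing memory and handing it to an
    // AOT-compiled Halide filter via the cast operator above
    // ("my_filter" is a hypothetical pipeline; the wrapping
    // constructors do not take ownership of `storage`):
    //
    //     std::vector<float> storage(640 * 480);
    //     Buffer<float> in(storage.data(), 640, 480);
    //     Buffer<float> out(640, 480);
    //     my_filter(in, out);  // both convert to halide_buffer_t *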
    /** Return a typed reference to this Buffer. Useful for converting
     * a reference to a Buffer<void> to a reference to, for example, a
     * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
     * Does a runtime assert if the source buffer type is void. */
    template<typename T2, int D2 = D,
             typename = typename std::enable_if<(D2 <= D)>::type>
    HALIDE_ALWAYS_INLINE
        Buffer<T2, D2> &
        as() & {
        Buffer<T2, D>::assert_can_convert_from(*this);
        return *((Buffer<T2, D2> *)this);
    }

    /** Return a const typed reference to this Buffer. Useful for
     * converting a const reference to one Buffer type to a const
     * reference to another Buffer type. Does a runtime assert if the
     * source buffer type is void. */
    template<typename T2, int D2 = D,
             typename = typename std::enable_if<(D2 <= D)>::type>
    HALIDE_ALWAYS_INLINE const Buffer<T2, D2> &as() const & {
        Buffer<T2, D>::assert_can_convert_from(*this);
        return *((const Buffer<T2, D2> *)this);
    }

    /** Returns this rval Buffer with a different type attached. Does
     * a dynamic type check if the source type is void. */
    template<typename T2, int D2 = D>
    HALIDE_ALWAYS_INLINE
        Buffer<T2, D2>
        as() && {
        Buffer<T2, D2>::assert_can_convert_from(*this);
        return *((Buffer<T2, D2> *)this);
    }

    /** as_const() is syntactic sugar for .as<const T>(), to avoid the need
     * to recapitulate the type argument. */
    // @{
    HALIDE_ALWAYS_INLINE
    Buffer<typename std::add_const<T>::type, D> &as_const() & {
        // Note that we can skip the assert_can_convert_from(), since T -> const T
        // conversion is always legal.
        return *((Buffer<typename std::add_const<T>::type> *)this);
    }

    HALIDE_ALWAYS_INLINE
    const Buffer<typename std::add_const<T>::type, D> &as_const() const & {
        return *((const Buffer<typename std::add_const<T>::type> *)this);
    }

    HALIDE_ALWAYS_INLINE
    Buffer<typename std::add_const<T>::type, D> as_const() && {
        return *((Buffer<typename std::add_const<T>::type> *)this);
    }
    // @}
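    // A hedged illustration of as() and as_const() (not from the
    // original docs; "load_something" is a hypothetical source of a
    // runtime-typed buffer):
    //
    //     Buffer<void> v = load_something();
    //     Buffer<uint8_t> &u8 = v.as<uint8_t>();  // asserts v.type() matches at runtime
    //     Buffer<const uint8_t> &ro = u8.as_const();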
    /** Conventional names for the first three dimensions. */
    // @{
    int width() const {
        return (dimensions() > 0) ? dim(0).extent() : 1;
    }
    int height() const {
        return (dimensions() > 1) ? dim(1).extent() : 1;
    }
    int channels() const {
        return (dimensions() > 2) ? dim(2).extent() : 1;
    }
    // @}

    /** Conventional names for the min and max value of each dimension */
    // @{
    int left() const {
        return dim(0).min();
    }

    int right() const {
        return dim(0).max();
    }

    int top() const {
        return dim(1).min();
    }

    int bottom() const {
        return dim(1).max();
    }
    // @}

    /** Make a new image which is a deep copy of this image. Use crop
     * or slice followed by copy to make a copy of only a portion of
     * the image. The new image uses the same memory layout as the
     * original, with holes compacted away. Note that the returned
     * Buffer is always of a non-const type T (i.e.
     *
     * Buffer<const T>.copy() -> Buffer<T> rather than Buffer<const T>
     *
     * ), which is always safe, since we are making a deep copy. (The caller
     * can easily cast it back to Buffer<const T> if desired, which is
     * always safe and free.)
     */
    Buffer<not_const_T, D> copy(void *(*allocate_fn)(size_t) = nullptr,
                                void (*deallocate_fn)(void *) = nullptr) const {
        Buffer<not_const_T, D> dst = Buffer<not_const_T, D>::make_with_shape_of(*this, allocate_fn, deallocate_fn);
        dst.copy_from(*this);
        return dst;
    }

    /** Like copy(), but the copy is created in interleaved memory layout
     * (vs. keeping the same memory layout as the original). Requires that 'this'
     * has exactly 3 dimensions.
     */
    Buffer<not_const_T, D> copy_to_interleaved(void *(*allocate_fn)(size_t) = nullptr,
                                               void (*deallocate_fn)(void *) = nullptr) const {
        assert(dimensions() == 3);
        Buffer<not_const_T, D> dst = Buffer<not_const_T, D>::make_interleaved(nullptr, width(), height(), channels());
        dst.set_min(min(0), min(1), min(2));
        dst.allocate(allocate_fn, deallocate_fn);
        dst.copy_from(*this);
        return dst;
    }

    /** Like copy(), but the copy is created in planar memory layout
     * (vs. keeping the same memory layout as the original).
     */
    Buffer<not_const_T, D> copy_to_planar(void *(*allocate_fn)(size_t) = nullptr,
                                          void (*deallocate_fn)(void *) = nullptr) const {
        std::vector<int> mins, extents;
        const int dims = dimensions();
        mins.reserve(dims);
        extents.reserve(dims);
        for (int d = 0; d < dims; ++d) {
            mins.push_back(dim(d).min());
            extents.push_back(dim(d).extent());
        }
        Buffer<not_const_T, D> dst = Buffer<not_const_T, D>(nullptr, extents);
        dst.set_min(mins);
        dst.allocate(allocate_fn, deallocate_fn);
        dst.copy_from(*this);
        return dst;
    }

    /** Make a copy of the Buffer which shares the underlying host and/or device
     * allocations as the existing Buffer. This is purely syntactic sugar for
     * cases where you have a const reference to a Buffer but need a temporary
     * non-const copy (e.g. to make a call into AOT-generated Halide code), and want a terse
     * inline way to create a temporary. \code
     * void call_my_func(const Buffer<const uint8_t>& input) {
     *     my_func(input.alias(), output);
     * }\endcode
     */
    inline Buffer<T, D> alias() const {
        return *this;
    }
    /** Fill a Buffer with the values at the same coordinates in
     * another Buffer. Restricts itself to coordinates contained
     * within the intersection of the two buffers. If the two Buffers
     * are not in the same coordinate system, you will need to
     * translate the argument Buffer first. E.g. if you're blitting a
     * sprite onto a framebuffer, you'll want to translate the sprite
     * to the correct location first like so: \code
     * framebuffer.copy_from(sprite.translated({x, y})); \endcode
     */
    template<typename T2, int D2>
    void copy_from(const Buffer<T2, D2> &other) {
        static_assert(!std::is_const<T>::value, "Cannot call copy_from() on a Buffer<const T>");
        assert(!device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty destination.");
        assert(!other.device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty source.");

        Buffer<const T, D> src(other);
        Buffer<T, D> dst(*this);

        assert(src.dimensions() == dst.dimensions());

        // Trim the copy to the region in common
        for (int i = 0; i < dimensions(); i++) {
            int min_coord = std::max(dst.dim(i).min(), src.dim(i).min());
            int max_coord = std::min(dst.dim(i).max(), src.dim(i).max());
            if (max_coord < min_coord) {
                // The buffers do not overlap.
                return;
            }
            dst.crop(i, min_coord, max_coord - min_coord + 1);
            src.crop(i, min_coord, max_coord - min_coord + 1);
        }

        // If T is void, we need to do runtime dispatch to an
        // appropriately-typed lambda. We're copying, so we only care
        // about the element size. (If not, this should optimize away
        // into a static dispatch to the right-sized copy.)
        if (T_is_void ? (type().bytes() == 1) : (sizeof(not_void_T) == 1)) {
            using MemType = uint8_t;
            auto &typed_dst = (Buffer<MemType, D> &)dst;
            auto &typed_src = (Buffer<const MemType, D> &)src;
            typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
        } else if (T_is_void ? (type().bytes() == 2) : (sizeof(not_void_T) == 2)) {
            using MemType = uint16_t;
            auto &typed_dst = (Buffer<MemType, D> &)dst;
            auto &typed_src = (Buffer<const MemType, D> &)src;
            typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
        } else if (T_is_void ? (type().bytes() == 4) : (sizeof(not_void_T) == 4)) {
            using MemType = uint32_t;
            auto &typed_dst = (Buffer<MemType, D> &)dst;
            auto &typed_src = (Buffer<const MemType, D> &)src;
            typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
        } else if (T_is_void ? (type().bytes() == 8) : (sizeof(not_void_T) == 8)) {
            using MemType = uint64_t;
            auto &typed_dst = (Buffer<MemType, D> &)dst;
            auto &typed_src = (Buffer<const MemType, D> &)src;
            typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
        } else {
            assert(false && "type().bytes() must be 1, 2, 4, or 8");
        }
        set_host_dirty();
    }
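    // A hedged sketch of copy_from() with mismatched coordinate
    // systems, following the sprite example in the doc comment above:
    //
    //     Buffer<uint32_t> framebuffer(1024, 768);
    //     Buffer<uint32_t> sprite(32, 32);
    //     // Copies the sprite into the 32x32 region whose top-left
    //     // corner is (100, 50); coordinates outside the framebuffer
    //     // would simply be trimmed away.
    //     framebuffer.copy_from(sprite.translated({100, 50}));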
    /** Make an image that refers to a sub-range of this image along
     * the given dimension. Asserts that the crop region is within
     * the existing bounds: you cannot "crop outwards", even if you know there
     * is valid Buffer storage (e.g. because you already cropped inwards). */
    Buffer<T, D> cropped(int d, int min, int extent) const {
        // Make a fresh copy of the underlying buffer (but not a fresh
        // copy of the allocation, if there is one).
        Buffer<T, D> im = *this;

        // This guarantees the preexisting device ref is dropped if the
        // device_crop call fails and maintains the buffer in a consistent
        // state.
        im.device_deallocate();

        im.crop_host(d, min, extent);
        if (buf.device_interface != nullptr) {
            complete_device_crop(im);
        }
        return im;
    }

    /** Crop an image in-place along the given dimension. This does
     * not move any data around in memory - it just changes the min
     * and extent of the given dimension. */
    void crop(int d, int min, int extent) {
        // An optimization for non-device buffers. For the device case,
        // a temp buffer is required, so reuse the not-in-place version.
        // TODO(zalman|abadams): Are nop crops common enough to special
        // case the device part of the if to do nothing?
        if (buf.device_interface != nullptr) {
            *this = cropped(d, min, extent);
        } else {
            crop_host(d, min, extent);
        }
    }

    /** Make an image that refers to a sub-rectangle of this image along
     * the first N dimensions. Asserts that the crop region is within
     * the existing bounds. The cropped image may drop any device handle
     * if the device_interface cannot accomplish the crop in-place. */
    Buffer<T, D> cropped(const std::vector<std::pair<int, int>> &rect) const {
        // Make a fresh copy of the underlying buffer (but not a fresh
        // copy of the allocation, if there is one).
        Buffer<T, D> im = *this;

        // This guarantees the preexisting device ref is dropped if the
        // device_crop call fails and maintains the buffer in a consistent
        // state.
        im.device_deallocate();

        im.crop_host(rect);
        if (buf.device_interface != nullptr) {
            complete_device_crop(im);
        }
        return im;
    }

    /** Crop an image in-place along the first N dimensions. This does
     * not move any data around in memory, nor does it free memory. It
     * just rewrites the min/extent of each dimension to refer to a
     * subregion of the same allocation. */
    void crop(const std::vector<std::pair<int, int>> &rect) {
        // An optimization for non-device buffers. For the device case,
        // a temp buffer is required, so reuse the not-in-place version.
        // TODO(zalman|abadams): Are nop crops common enough to special
        // case the device part of the if to do nothing?
        if (buf.device_interface != nullptr) {
            *this = cropped(rect);
        } else {
            crop_host(rect);
        }
    }
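    // A hedged example of the two crop styles above (illustrative
    // only): both views share the original allocation. Each rect entry
    // is a (min, extent) pair.
    //
    //     Buffer<float> im(640, 480);
    //     Buffer<float> tile = im.cropped(0, 100, 64)   // x in [100, 164)
    //                            .cropped(1, 200, 64);  // y in [200, 264)
    //     im.crop({{0, 320}, {0, 240}});                // in-place: top-left quadrant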
    /** Make an image which refers to the same data using translated
     * coordinates in the given dimension. Positive values
     * move the image data to the right or down relative to the
     * coordinate system. Drops any device handle. */
    Buffer<T, D> translated(int d, int dx) const {
        Buffer<T, D> im = *this;
        im.translate(d, dx);
        return im;
    }

    /** Translate an image in-place along one dimension by changing
     * how it is indexed. Does not move any data around in memory. */
    void translate(int d, int delta) {
        assert(d >= 0 && d < this->dimensions());
        device_deallocate();
        buf.dim[d].min += delta;
    }

    /** Make an image which refers to the same data translated along
     * the first N dimensions. */
    Buffer<T, D> translated(const std::vector<int> &delta) const {
        Buffer<T, D> im = *this;
        im.translate(delta);
        return im;
    }

    /** Translate an image along the first N dimensions by changing
     * how it is indexed. Does not move any data around in memory. */
    void translate(const std::vector<int> &delta) {
        device_deallocate();
        assert(delta.size() <= static_cast<decltype(delta.size())>(std::numeric_limits<int>::max()));
        int limit = (int)delta.size();
        assert(limit <= dimensions());
        for (int i = 0; i < limit; i++) {
            translate(i, delta[i]);
        }
    }

    /** Set the min coordinate of an image in the first N dimensions. */
    // @{
    void set_min(const std::vector<int> &mins) {
        assert(mins.size() <= static_cast<decltype(mins.size())>(dimensions()));
        device_deallocate();
        for (size_t i = 0; i < mins.size(); i++) {
            buf.dim[i].min = mins[i];
        }
    }

    template<typename... Args>
    void set_min(Args... args) {
        set_min(std::vector<int>{args...});
    }
    // @}

    /** Test if a given coordinate is within the bounds of an image. */
    // @{
    bool contains(const std::vector<int> &coords) const {
        assert(coords.size() <= static_cast<decltype(coords.size())>(dimensions()));
        for (size_t i = 0; i < coords.size(); i++) {
            if (coords[i] < dim((int)i).min() || coords[i] > dim((int)i).max()) {
                return false;
            }
        }
        return true;
    }

    template<typename... Args>
    bool contains(Args... args) const {
        return contains(std::vector<int>{args...});
    }
    // @}

    /** Make a buffer which refers to the same data in the same layout
     * using a swapped indexing order for the dimensions given. So
     * A = B.transposed(0, 1) means that A(i, j) == B(j, i), and more
     * strongly that A.address_of(i, j) == B.address_of(j, i). */
    Buffer<T, D> transposed(int d1, int d2) const {
        Buffer<T, D> im = *this;
        im.transpose(d1, d2);
        return im;
    }

    /** Transpose a buffer in-place by changing how it is indexed. For
     * example, transpose(0, 1) on a two-dimensional buffer means that
     * the value referred to by coordinates (i, j) is now reached at
     * the coordinates (j, i), and vice versa. This is done by
     * reordering the per-dimension metadata rather than by moving
     * data around in memory, so other views of the same memory will
     * not see the data as having been transposed. */
    void transpose(int d1, int d2) {
        assert(d1 >= 0 && d1 < this->dimensions());
        assert(d2 >= 0 && d2 < this->dimensions());
        std::swap(buf.dim[d1], buf.dim[d2]);
    }

    /** A generalized transpose: instead of swapping two dimensions,
     * pass a vector that lists each dimension index exactly once, in
     * the desired order. This does not move any data around in memory
     * - it just permutes how it is indexed. */
    void transpose(const std::vector<int> &order) {
        assert((int)order.size() == dimensions());
        if (dimensions() < 2) {
            // My, that was easy
            return;
        }

        std::vector<int> order_sorted = order;
        for (size_t i = 1; i < order_sorted.size(); i++) {
            for (size_t j = i; j > 0 && order_sorted[j - 1] > order_sorted[j]; j--) {
                std::swap(order_sorted[j], order_sorted[j - 1]);
                transpose(j, j - 1);
            }
        }
    }

    /** Make a buffer which refers to the same data in the same
     * layout using a different ordering of the dimensions. */
    Buffer<T, D> transposed(const std::vector<int> &order) const {
        Buffer<T, D> im = *this;
        im.transpose(order);
        return im;
    }
    /** Make a lower-dimensional buffer that refers to one slice of
     * this buffer. */
    Buffer<T, D> sliced(int d, int pos) const {
        Buffer<T, D> im = *this;

        // This guarantees the preexisting device ref is dropped if the
        // device_slice call fails and maintains the buffer in a consistent
        // state.
        im.device_deallocate();

        im.slice_host(d, pos);
        if (buf.device_interface != nullptr) {
            complete_device_slice(im, d, pos);
        }
        return im;
    }

    /** Make a lower-dimensional buffer that refers to one slice of this
     * buffer at the dimension's minimum. */
    inline Buffer<T, D> sliced(int d) const {
        return sliced(d, dim(d).min());
    }

    /** Rewrite the buffer to refer to a single lower-dimensional
     * slice of itself along the given dimension at the given
     * coordinate. Does not move any data around or free the original
     * memory, so other views of the same data are unaffected. */
    void slice(int d, int pos) {
        // An optimization for non-device buffers. For the device case,
        // a temp buffer is required, so reuse the not-in-place version.
        // TODO(zalman|abadams): Are nop slices common enough to special
        // case the device part of the if to do nothing?
        if (buf.device_interface != nullptr) {
            *this = sliced(d, pos);
        } else {
            slice_host(d, pos);
        }
    }

    /** Slice a buffer in-place at the dimension's minimum. */
    inline void slice(int d) {
        slice(d, dim(d).min());
    }

    /** Make a new buffer that views this buffer as a single slice in a
     * higher-dimensional space. The new dimension has extent one and
     * the given min. This operation is the opposite of slice. As an
     * example, the following condition is true:
     *
     \code
     im2 = im.embedded(1, 17);
     &im(x, y, c) == &im2(x, 17, y, c);
     \endcode
     */
    Buffer<T, D> embedded(int d, int pos = 0) const {
        Buffer<T, D> im(*this);
        im.embed(d, pos);
        return im;
    }

    /** Embed a buffer in-place, increasing the
     * dimensionality. */
    void embed(int d, int pos = 0) {
        assert(d >= 0 && d <= dimensions());
        add_dimension();
        translate(dimensions() - 1, pos);
        for (int i = dimensions() - 1; i > d; i--) {
            transpose(i, i - 1);
        }
    }
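    // A hedged example of slicing (illustrative only): extracting one
    // channel of an interleaved image as a 2D view without copying.
    //
    //     Buffer<uint8_t> rgb = Buffer<uint8_t>::make_interleaved(640, 480, 3);
    //     Buffer<uint8_t> green = rgb.sliced(2, 1);  // now 2D, indexed (x, y)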
    /** Add a new dimension with a min of zero and an extent of
     * one. The stride is the extent of the outermost dimension times
     * its stride. The new dimension is the last dimension. This is a
     * special case of embed. */
    void add_dimension() {
        const int dims = buf.dimensions;
        buf.dimensions++;
        if (buf.dim != shape) {
            // We're already on the heap. Reallocate.
            halide_dimension_t *new_shape = new halide_dimension_t[buf.dimensions];
            for (int i = 0; i < dims; i++) {
                new_shape[i] = buf.dim[i];
            }
            delete[] buf.dim;
            buf.dim = new_shape;
        } else if (dims == D) {
            // Transition from the in-class storage to the heap
            make_shape_storage(buf.dimensions);
            for (int i = 0; i < dims; i++) {
                buf.dim[i] = shape[i];
            }
        } else {
            // We still fit in the class
        }
        buf.dim[dims] = {0, 1, 0};
        if (dims == 0) {
            buf.dim[dims].stride = 1;
        } else {
            buf.dim[dims].stride = buf.dim[dims - 1].extent * buf.dim[dims - 1].stride;
        }
    }

    /** Add a new dimension with a min of zero, an extent of one, and
     * the specified stride. The new dimension is the last
     * dimension. This is a special case of embed. */
    void add_dimension_with_stride(int s) {
        add_dimension();
        buf.dim[buf.dimensions - 1].stride = s;
    }

    /** Methods for managing any GPU allocation. */
    // @{
    // Set the host dirty flag. Called by every operator()
    // access. Must be inlined so it can be hoisted out of loops.
    HALIDE_ALWAYS_INLINE
    void set_host_dirty(bool v = true) {
        assert((!v || !device_dirty()) && "Cannot set host dirty when device is already dirty. Call copy_to_host() before accessing the buffer from host.");
        buf.set_host_dirty(v);
    }

    // Check if the device allocation is dirty. Called by
    // set_host_dirty, which is called by every accessor. Must be
    // inlined so it can be hoisted out of loops.
    HALIDE_ALWAYS_INLINE
    bool device_dirty() const {
        return buf.device_dirty();
    }

    bool host_dirty() const {
        return buf.host_dirty();
    }

    void set_device_dirty(bool v = true) {
        assert((!v || !host_dirty()) && "Cannot set device dirty when host is already dirty.");
        buf.set_device_dirty(v);
    }

    int copy_to_host(void *ctx = nullptr) {
        if (device_dirty()) {
            return buf.device_interface->copy_to_host(ctx, &buf);
        }
        return 0;
    }

    int copy_to_device(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
        if (host_dirty()) {
            return device_interface->copy_to_device(ctx, &buf, device_interface);
        }
        return 0;
    }
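    // A hedged sketch of the dirty-bit protocol above (illustrative
    // only; halide_cuda_device_interface() is assumed to come from
    // HalideRuntimeCuda.h, and "my_pipeline" is hypothetical):
    //
    //     Buffer<float> im(640, 480);
    //     im(0, 0) = 1.0f;                                    // operator() sets host_dirty
    //     im.copy_to_device(halide_cuda_device_interface());  // copies only if host_dirty()
    //     my_pipeline(im);                                    // may compute on the device
    //     im.copy_to_host();                                  // copies only if device_dirty()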
    int device_detach_native(void *ctx = nullptr) {
        assert(dev_ref_count &&
               dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative &&
               "Only call device_detach_native on buffers wrapping a native "
               "device handle via device_wrap_native. This buffer was allocated "
               "using device_malloc, or is unmanaged. "
               "Call device_free or free the original allocation instead.");
        // Multiple people may be holding onto this dev field
        assert(dev_ref_count->count == 1 &&
               "Multiple Halide::Runtime::Buffer objects share this device "
               "allocation. Freeing it could create dangling references. "
               "Don't call device_detach_native on Halide buffers that you "
               "have copied or passed by value.");
        int ret = 0;
        if (buf.device_interface) {
            ret = buf.device_interface->detach_native(ctx, &buf);
        }
        delete dev_ref_count;
        dev_ref_count = nullptr;
        return ret;
    }

    int device_and_host_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
        return device_interface->device_and_host_malloc(ctx, &buf, device_interface);
    }

    int device_and_host_free(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
        if (dev_ref_count) {
            assert(dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost &&
                   "Can't call device_and_host_free on a device handle not allocated with device_and_host_malloc. "
                   "Free the source allocation or call device_detach_native instead.");
            // Multiple people may be holding onto this dev field
            assert(dev_ref_count->count == 1 &&
                   "Multiple Halide::Runtime::Buffer objects share this device "
                   "allocation. Freeing it would create dangling references. "
                   "Don't call device_and_host_free on Halide buffers that you have copied or "
                   "passed by value.");
        }
        int ret = 0;
        if (buf.device_interface) {
            ret = buf.device_interface->device_and_host_free(ctx, &buf);
        }
        if (dev_ref_count) {
            delete dev_ref_count;
            dev_ref_count = nullptr;
        }
        return ret;
    }

    int device_sync(void *ctx = nullptr) {
        if (buf.device_interface) {
            return buf.device_interface->device_sync(ctx, &buf);
        } else {
            return 0;
        }
    }

    bool has_device_allocation() const {
        return buf.device != 0;
    }

    /** Return the method by which the device field is managed. */
    BufferDeviceOwnership device_ownership() const {
        if (dev_ref_count == nullptr) {
            return BufferDeviceOwnership::Allocated;
        }
        return dev_ref_count->ownership;
    }
    // @}

    /** If you use the (x, y, c) indexing convention, then Halide
     * Buffers are stored planar by default. This function constructs
     * an interleaved RGB or RGBA image that can still be indexed
     * using (x, y, c). Passing it to a generator requires that the
     * generator has been compiled with support for interleaved (also
     * known as packed or chunky) memory layouts. */
    static Buffer<void, D> make_interleaved(halide_type_t t, int width, int height, int channels) {
        Buffer<void, D> im(t, channels, width, height);
        // Note that this is equivalent to calling transpose({2, 0, 1}),
        // but slightly more efficient.
        im.transpose(0, 1);
        im.transpose(1, 2);
        return im;
    }

    /** If you use the (x, y, c) indexing convention, then Halide
     * Buffers are stored planar by default. This function constructs
     * an interleaved RGB or RGBA image that can still be indexed
     * using (x, y, c). Passing it to a generator requires that the
     * generator has been compiled with support for interleaved (also
     * known as packed or chunky) memory layouts. */
    static Buffer<T, D> make_interleaved(int width, int height, int channels) {
        return make_interleaved(static_halide_type(), width, height, channels);
    }
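    // For reference, an interleaved image made this way stores the
    // channels of one pixel adjacently in memory. An illustrative
    // check of the resulting strides:
    //
    //     Buffer<uint8_t> im = Buffer<uint8_t>::make_interleaved(640, 480, 3);
    //     // im.dim(0).stride() == 3        (x steps over whole pixels)
    //     // im.dim(1).stride() == 3 * 640  (y steps over whole rows)
    //     // im.dim(2).stride() == 1        (channels are adjacent)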
    /** Wrap an existing interleaved image. */
    static Buffer<add_const_if_T_is_const<void>, D>
    make_interleaved(halide_type_t t, T *data, int width, int height, int channels) {
        Buffer<add_const_if_T_is_const<void>, D> im(t, data, channels, width, height);
        im.transpose(0, 1);
        im.transpose(1, 2);
        return im;
    }

    /** Wrap an existing interleaved image. */
    static Buffer<T, D> make_interleaved(T *data, int width, int height, int channels) {
        return make_interleaved(static_halide_type(), data, width, height, channels);
    }

    /** Make a zero-dimensional Buffer */
    static Buffer<add_const_if_T_is_const<void>, D> make_scalar(halide_type_t t) {
        Buffer<add_const_if_T_is_const<void>, 1> buf(t, 1);
        buf.slice(0, 0);
        return buf;
    }

    /** Make a zero-dimensional Buffer */
    static Buffer<T, D> make_scalar() {
        Buffer<T, 1> buf(1);
        buf.slice(0, 0);
        return buf;
    }

    /** Make a zero-dimensional Buffer that points to non-owned, existing data */
    static Buffer<T, D> make_scalar(T *data) {
        Buffer<T, 1> buf(data, 1);
        buf.slice(0, 0);
        return buf;
    }

    /** Make a buffer with the same shape and memory nesting order as
     * another buffer. It may have a different type. */
    template<typename T2, int D2>
    static Buffer<T, D> make_with_shape_of(Buffer<T2, D2> src,
                                           void *(*allocate_fn)(size_t) = nullptr,
                                           void (*deallocate_fn)(void *) = nullptr) {

        const halide_type_t dst_type = T_is_void ? src.type() : halide_type_of<typename std::remove_cv<not_void_T>::type>();
        return Buffer<>::make_with_shape_of_helper(dst_type, src.dimensions(), src.buf.dim,
                                                   allocate_fn, deallocate_fn);
    }
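    // An illustrative use of make_with_shape_of: allocate a float
    // buffer laid out identically to an existing interleaved uint8
    // buffer. This is a sketch, not part of the interface:
    //
    //     Buffer<uint8_t> src = Buffer<uint8_t>::make_interleaved(640, 480, 3);
    //     Buffer<float> dst = Buffer<float>::make_with_shape_of(src);
    //     // dst has the same mins, extents, and stride ordering as src,
    //     // but its own freshly-allocated float storage.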
private:
    static Buffer<> make_with_shape_of_helper(halide_type_t dst_type,
                                              int dimensions,
                                              halide_dimension_t *shape,
                                              void *(*allocate_fn)(size_t),
                                              void (*deallocate_fn)(void *)) {
        // Reorder the dimensions of src to have strides in increasing order
        std::vector<int> swaps;
        for (int i = dimensions - 1; i > 0; i--) {
            for (int j = i; j > 0; j--) {
                if (shape[j - 1].stride > shape[j].stride) {
                    std::swap(shape[j - 1], shape[j]);
                    swaps.push_back(j);
                }
            }
        }

        // Rewrite the strides to be dense (this messes up src, which
        // is why we took it by value).
        for (int i = 0; i < dimensions; i++) {
            if (i == 0) {
                shape[i].stride = 1;
            } else {
                shape[i].stride = shape[i - 1].extent * shape[i - 1].stride;
            }
        }

        // Undo the dimension reordering
        while (!swaps.empty()) {
            int j = swaps.back();
            std::swap(shape[j - 1], shape[j]);
            swaps.pop_back();
        }

        // Use an explicit runtime type, and make dst a Buffer<void>, to allow
        // using this method with Buffer<void> for either src or dst.
        Buffer<> dst(dst_type, nullptr, dimensions, shape);
        dst.allocate(allocate_fn, deallocate_fn);

        return dst;
    }

    template<typename... Args>
    HALIDE_ALWAYS_INLINE
    ptrdiff_t
    offset_of(int d, int first, Args... rest) const {
        return offset_of(d + 1, rest...) + this->buf.dim[d].stride * (first - this->buf.dim[d].min);
    }

    HALIDE_ALWAYS_INLINE
    ptrdiff_t offset_of(int d) const {
        return 0;
    }

    template<typename... Args>
    HALIDE_ALWAYS_INLINE
    storage_T *
    address_of(Args... args) const {
        if (T_is_void) {
            return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes();
        } else {
            return (storage_T *)(this->buf.host) + offset_of(0, args...);
        }
    }

    HALIDE_ALWAYS_INLINE
    ptrdiff_t offset_of(const int *pos) const {
        ptrdiff_t offset = 0;
        for (int i = this->dimensions() - 1; i >= 0; i--) {
            offset += this->buf.dim[i].stride * (pos[i] - this->buf.dim[i].min);
        }
        return offset;
    }

    HALIDE_ALWAYS_INLINE
    storage_T *address_of(const int *pos) const {
        if (T_is_void) {
            return (storage_T *)this->buf.host + offset_of(pos) * type().bytes();
        } else {
            return (storage_T *)this->buf.host + offset_of(pos);
        }
    }

public:
    /** Get a pointer to the address of the min coordinate. */
    T *data() const {
        return (T *)(this->buf.host);
    }

    /** Access elements. Use im(...) to get a reference to an element,
     * and use &im(...) to get the address of an element. If you pass
     * fewer arguments than the buffer has dimensions, the rest are
     * treated as their min coordinate. The non-const versions set the
     * host_dirty flag to true.
     */
    //@{
    template<typename... Args,
             typename = typename std::enable_if<AllInts<Args...>::value>::type>
    HALIDE_ALWAYS_INLINE const not_void_T &operator()(int first, Args... rest) const {
        static_assert(!T_is_void,
                      "Cannot use operator() on Buffer<void> types");
        assert(!device_dirty());
        return *((const not_void_T *)(address_of(first, rest...)));
    }

    HALIDE_ALWAYS_INLINE
    const not_void_T &
    operator()() const {
        static_assert(!T_is_void,
                      "Cannot use operator() on Buffer<void> types");
        assert(!device_dirty());
        return *((const not_void_T *)(data()));
    }

    HALIDE_ALWAYS_INLINE
    const not_void_T &
    operator()(const int *pos) const {
        static_assert(!T_is_void,
                      "Cannot use operator() on Buffer<void> types");
        assert(!device_dirty());
        return *((const not_void_T *)(address_of(pos)));
    }

    template<typename... Args,
             typename = typename std::enable_if<AllInts<Args...>::value>::type>
    HALIDE_ALWAYS_INLINE
    not_void_T &
    operator()(int first, Args... rest) {
        static_assert(!T_is_void,
                      "Cannot use operator() on Buffer<void> types");
        set_host_dirty();
        return *((not_void_T *)(address_of(first, rest...)));
    }

    HALIDE_ALWAYS_INLINE
    not_void_T &
    operator()() {
        static_assert(!T_is_void,
                      "Cannot use operator() on Buffer<void> types");
        set_host_dirty();
        return *((not_void_T *)(data()));
    }

    HALIDE_ALWAYS_INLINE
    not_void_T &
    operator()(const int *pos) {
        static_assert(!T_is_void,
                      "Cannot use operator() on Buffer<void> types");
        set_host_dirty();
        return *((not_void_T *)(address_of(pos)));
    }
    // @}

    /** Tests that all values in this buffer are equal to val. */
    bool all_equal(not_void_T val) const {
        bool all_equal = true;
        for_each_element([&](const int *pos) { all_equal &= (*this)(pos) == val; });
        return all_equal;
    }

    /** Fill every site in the buffer with the given value. */
    Buffer<T, D> &fill(not_void_T val) {
        set_host_dirty();
        for_each_value([=](T &v) { v = val; });
        return *this;
    }
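    // A few element-access idioms, as an illustrative sketch:
    //
    //     Buffer<float, 2> im(4, 4);
    //     im.fill(1.0f);           // set every sample
    //     im(1, 2) = 3.0f;         // write one sample (sets host_dirty)
    //     float *row = &im(0, 2);  // address of the start of row 2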
private:
    /** Helper functions for for_each_value. */
    // @{
    template<int N>
    struct for_each_value_task_dim {
        int extent;
        int stride[N];
    };

    // Given an array of strides, and a bunch of pointers to pointers
    // (all of different types), advance the pointers using the
    // strides.
    template<typename Ptr, typename... Ptrs>
    HALIDE_ALWAYS_INLINE static void advance_ptrs(const int *stride, Ptr *ptr, Ptrs... ptrs) {
        (*ptr) += *stride;
        advance_ptrs(stride + 1, ptrs...);
    }

    HALIDE_ALWAYS_INLINE
    static void advance_ptrs(const int *) {
    }

    // Same as the above, but just increments the pointers.
    template<typename Ptr, typename... Ptrs>
    HALIDE_ALWAYS_INLINE static void increment_ptrs(Ptr *ptr, Ptrs... ptrs) {
        (*ptr)++;
        increment_ptrs(ptrs...);
    }

    HALIDE_ALWAYS_INLINE
    static void increment_ptrs() {
    }

    template<typename Fn, typename... Ptrs>
    HALIDE_NEVER_INLINE static void for_each_value_helper(Fn &&f, int d, bool innermost_strides_are_one,
                                                          const for_each_value_task_dim<sizeof...(Ptrs)> *t, Ptrs... ptrs) {
        if (d == -1) {
            f((*ptrs)...);
        } else if (d == 0) {
            if (innermost_strides_are_one) {
                for (int i = t[0].extent; i != 0; i--) {
                    f((*ptrs)...);
                    increment_ptrs((&ptrs)...);
                }
            } else {
                for (int i = t[0].extent; i != 0; i--) {
                    f((*ptrs)...);
                    advance_ptrs(t[0].stride, (&ptrs)...);
                }
            }
        } else {
            for (int i = t[d].extent; i != 0; i--) {
                for_each_value_helper(f, d - 1, innermost_strides_are_one, t, ptrs...);
                advance_ptrs(t[d].stride, (&ptrs)...);
            }
        }
    }

    template<int N>
    HALIDE_NEVER_INLINE static bool for_each_value_prep(for_each_value_task_dim<N> *t,
                                                        const halide_buffer_t **buffers) {
        // Check the buffers all have clean host allocations
        for (int i = 0; i < N; i++) {
            if (buffers[i]->device) {
                assert(buffers[i]->host &&
                       "Buffer passed to for_each_value has device allocation but no host allocation. Call allocate() and copy_to_host() first");
                assert(!buffers[i]->device_dirty() &&
                       "Buffer passed to for_each_value is dirty on device. Call copy_to_host() first");
            } else {
                assert(buffers[i]->host &&
                       "Buffer passed to for_each_value has no host or device allocation");
            }
        }

        const int dimensions = buffers[0]->dimensions;

        // Extract the strides in all the dimensions
        for (int i = 0; i < dimensions; i++) {
            for (int j = 0; j < N; j++) {
                assert(buffers[j]->dimensions == dimensions);
                assert(buffers[j]->dim[i].extent == buffers[0]->dim[i].extent &&
                       buffers[j]->dim[i].min == buffers[0]->dim[i].min);
                const int s = buffers[j]->dim[i].stride;
                t[i].stride[j] = s;
            }
            t[i].extent = buffers[0]->dim[i].extent;

            // Order the dimensions by stride, so that the traversal is cache-coherent.
            for (int j = i; j > 0 && t[j].stride[0] < t[j - 1].stride[0]; j--) {
                std::swap(t[j], t[j - 1]);
            }
        }

        // Flatten dimensions where possible to make a larger inner
        // loop for autovectorization.
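        // For example (illustrative numbers): two dense dimensions
        // with extents {640, 480} and strides {1, 640} satisfy
        // stride[0] * extent[0] == stride[1], so they collapse into a
        // single dimension with extent 640 * 480 and stride 1.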
        int d = dimensions;
        for (int i = 1; i < d; i++) {
            bool flat = true;
            for (int j = 0; j < N; j++) {
                flat = flat && t[i - 1].stride[j] * t[i - 1].extent == t[i].stride[j];
            }
            if (flat) {
                t[i - 1].extent *= t[i].extent;
                for (int j = i; j < d; j++) {
                    t[j] = t[j + 1];
                }
                i--;
                d--;
                t[d].extent = 1;
            }
        }

        bool innermost_strides_are_one = true;
        if (dimensions > 0) {
            for (int i = 0; i < N; i++) {
                innermost_strides_are_one &= (t[0].stride[i] == 1);
            }
        }

        return innermost_strides_are_one;
    }

    template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
    void for_each_value_impl(Fn &&f, Args &&... other_buffers) const {
        Buffer<>::for_each_value_task_dim<N> *t =
            (Buffer<>::for_each_value_task_dim<N> *)HALIDE_ALLOCA((dimensions() + 1) * sizeof(for_each_value_task_dim<N>));
        // Move the preparatory code into a non-templated helper to
        // save code size.
        const halide_buffer_t *buffers[] = {&buf, (&other_buffers.buf)...};
        bool innermost_strides_are_one = Buffer<>::for_each_value_prep(t, buffers);

        Buffer<>::for_each_value_helper(f, dimensions() - 1,
                                        innermost_strides_are_one,
                                        t,
                                        data(), (other_buffers.data())...);
    }
    // @}

public:
    /** Call a function on every value in the buffer, and the
     * corresponding values in some number of other buffers of the
     * same size. The function should take a reference, const
     * reference, or value of the correct type for each buffer. This
     * effectively lifts a function of scalars to an element-wise
     * function of buffers. This produces code that the compiler can
     * autovectorize. This is slightly cheaper than for_each_element,
     * because it does not need to track the coordinates.
     *
     * Note that constness of Buffers is preserved: a const Buffer<T> (for either
     * 'this' or the other-buffers arguments) will allow mutation of the
     * buffer contents, while a Buffer<const T> will not. Attempting to specify
     * a mutable reference for the lambda argument of a Buffer<const T>
     * will result in a compilation error. */
    // @{
    template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
    HALIDE_ALWAYS_INLINE const Buffer<T, D> &for_each_value(Fn &&f, Args &&... other_buffers) const {
        for_each_value_impl(f, std::forward<Args>(other_buffers)...);
        return *this;
    }

    template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
    HALIDE_ALWAYS_INLINE
    Buffer<T, D> &
    for_each_value(Fn &&f, Args &&... other_buffers) {
        for_each_value_impl(f, std::forward<Args>(other_buffers)...);
        return *this;
    }
    // @}
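    // An illustrative sketch of lifting a scalar function over
    // buffers: compute dst = a + b element-wise.
    //
    //     Buffer<float, 2> a(8, 8), b(8, 8), dst(8, 8);
    //     dst.for_each_value([](float &d, float av, float bv) {
    //         d = av + bv;
    //     }, a, b);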
private:
    // Helper functions for for_each_element
    struct for_each_element_task_dim {
        int min, max;
    };

    /** If f is callable with this many args, call it. The first
     * argument is just to make the overloads distinct. Actual
     * overload selection is done using the enable_if. */
    template<typename Fn,
             typename... Args,
             typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
    HALIDE_ALWAYS_INLINE static void for_each_element_variadic(int, int, const for_each_element_task_dim *, Fn &&f, Args... args) {
        f(args...);
    }

    /** If the above overload is impossible, we add an outer loop over
     * an additional argument and try again. */
    template<typename Fn,
             typename... Args>
    HALIDE_ALWAYS_INLINE static void for_each_element_variadic(double, int d, const for_each_element_task_dim *t, Fn &&f, Args... args) {
        for (int i = t[d].min; i <= t[d].max; i++) {
            for_each_element_variadic(0, d - 1, t, std::forward<Fn>(f), i, args...);
        }
    }

    /** Determine the minimum number of arguments a callable can take
     * using the same trick. */
    template<typename Fn,
             typename... Args,
             typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
    HALIDE_ALWAYS_INLINE static int num_args(int, Fn &&, Args...) {
        return (int)(sizeof...(Args));
    }

    /** The recursive version is only enabled up to a recursion limit
     * of 256. This catches callables that aren't callable with any
     * number of ints. */
    template<typename Fn,
             typename... Args>
    HALIDE_ALWAYS_INLINE static int num_args(double, Fn &&f, Args... args) {
        static_assert(sizeof...(args) <= 256,
                      "Callable passed to for_each_element must accept either a const int *,"
                      " or up to 256 ints. No such operator found. Expect infinite template recursion.");
        return num_args(0, std::forward<Fn>(f), 0, args...);
    }

    /** A version where the callable takes a position array instead,
     * with compile-time recursion on the dimensionality. This
     * overload is preferred to the one below using the same int vs
     * double trick as above, but is impossible once d hits -1 using
     * std::enable_if. */
    template<int d,
             typename Fn,
             typename = typename std::enable_if<(d >= 0)>::type>
    HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(int, const for_each_element_task_dim *t, Fn &&f, int *pos) {
        for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
            for_each_element_array_helper<d - 1>(0, t, std::forward<Fn>(f), pos);
        }
    }

    /** Base case for recursion above. */
    template<int d,
             typename Fn,
             typename = typename std::enable_if<(d < 0)>::type>
    HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(double, const for_each_element_task_dim *t, Fn &&f, int *pos) {
        f(pos);
    }

    /** A run-time-recursive version (instead of
     * compile-time-recursive) that requires the callable to take a
     * pointer to a position array instead. Dispatches to the
     * compile-time-recursive version once the dimensionality gets
     * small. */
    template<typename Fn>
    static void for_each_element_array(int d, const for_each_element_task_dim *t, Fn &&f, int *pos) {
        if (d == -1) {
            f(pos);
        } else if (d == 0) {
            // Once the dimensionality gets small enough, dispatch to
            // a compile-time-recursive version for better codegen of
            // the inner loops.
            for_each_element_array_helper<0, Fn>(0, t, std::forward<Fn>(f), pos);
        } else if (d == 1) {
            for_each_element_array_helper<1, Fn>(0, t, std::forward<Fn>(f), pos);
        } else if (d == 2) {
            for_each_element_array_helper<2, Fn>(0, t, std::forward<Fn>(f), pos);
        } else if (d == 3) {
            for_each_element_array_helper<3, Fn>(0, t, std::forward<Fn>(f), pos);
        } else {
            for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
                for_each_element_array(d - 1, t, std::forward<Fn>(f), pos);
            }
        }
    }
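    // The int-vs-double first parameter used above is an
    // overload-ranking idiom: calling with the literal 0 prefers the
    // int overload when its decltype constraint is satisfiable, and
    // otherwise falls back to the double overload via implicit
    // conversion. A minimal standalone sketch of the idiom
    // (illustrative, not part of this class):
    //
    //     template<typename Fn, typename = decltype(std::declval<Fn>()(0))>
    //     void dispatch(int, Fn &&f) { f(0); }  // chosen if f(int) compiles
    //     template<typename Fn>
    //     void dispatch(double, Fn &&) {}       // fallback otherwise
    //     // dispatch(0, callable) ranks the int overload first.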
    /** We now have two overloads for for_each_element. This one
     * triggers if the callable takes a const int *.
     */
    template<typename Fn,
             typename = decltype(std::declval<Fn>()((const int *)nullptr))>
    static void for_each_element(int, int dims, const for_each_element_task_dim *t, Fn &&f, int check = 0) {
        int *pos = (int *)HALIDE_ALLOCA(dims * sizeof(int));
        for_each_element_array(dims - 1, t, std::forward<Fn>(f), pos);
    }

    /** This one triggers otherwise. It treats the callable as
     * something that takes some number of ints. */
    template<typename Fn>
    HALIDE_ALWAYS_INLINE static void for_each_element(double, int dims, const for_each_element_task_dim *t, Fn &&f) {
        int args = num_args(0, std::forward<Fn>(f));
        assert(dims >= args);
        for_each_element_variadic(0, args - 1, t, std::forward<Fn>(f));
    }

    template<typename Fn>
    void for_each_element_impl(Fn &&f) const {
        for_each_element_task_dim *t =
            (for_each_element_task_dim *)HALIDE_ALLOCA(dimensions() * sizeof(for_each_element_task_dim));
        for (int i = 0; i < dimensions(); i++) {
            t[i].min = dim(i).min();
            t[i].max = dim(i).max();
        }
        for_each_element(0, dimensions(), t, std::forward<Fn>(f));
    }

public:
    /** Call a function at each site in a buffer. This is likely to be
     * much slower than using Halide code to populate a buffer, but is
     * convenient for tests. If the function has more arguments than the
     * buffer has dimensions, the remaining arguments will be zero. If it
     * has fewer arguments than the buffer has dimensions then the last
     * few dimensions of the buffer are not iterated over. For example,
     * the following code exploits this to set a floating point RGB image
     * to red:

     \code
     Buffer<float, 3> im(100, 100, 3);
     im.for_each_element([&](int x, int y) {
         im(x, y, 0) = 1.0f;
         im(x, y, 1) = 0.0f;
         im(x, y, 2) = 0.0f;
     });
     \endcode

     * The compiled code is equivalent to writing a nested for loop,
     * and compilers are capable of optimizing it in the same way.
     *
     * If the callable can be called with an int * as the sole argument,
     * that version is called instead. Each location in the buffer is
     * passed to it in a coordinate array. This version is higher-overhead
     * than the variadic version, but is useful for writing generic code
     * that accepts buffers of arbitrary dimensionality. For example, the
     * following sets the value at all sites in an arbitrary-dimensional
     * buffer to their first coordinate:

     \code
     im.for_each_element([&](const int *pos) { im(pos) = pos[0]; });
     \endcode

     * It is also possible to use for_each_element to iterate over entire
     * rows or columns by cropping the buffer to a single column or row
     * respectively and iterating over elements of the result. For example,
     * to set the diagonal of the image to 1 by iterating over the columns:

     \code
     Buffer<float, 3> im(100, 100, 3);
     im.sliced(1, 0).for_each_element([&](int x, int c) {
         im(x, x, c) = 1.0f;
     });
     \endcode

     * Or, assuming the memory layout is known to be dense per row, one can
     * memset each row of an image like so:

     \code
     Buffer<float, 3> im(100, 100, 3);
     im.sliced(0, 0).for_each_element([&](int y, int c) {
         memset(&im(0, y, c), 0, sizeof(float) * im.width());
     });
     \endcode
     */
    // @{
    template<typename Fn>
    HALIDE_ALWAYS_INLINE const Buffer<T, D> &for_each_element(Fn &&f) const {
        for_each_element_impl(f);
        return *this;
    }

    template<typename Fn>
    HALIDE_ALWAYS_INLINE
    Buffer<T, D> &
    for_each_element(Fn &&f) {
        for_each_element_impl(f);
        return *this;
    }
    // @}

private:
    template<typename Fn>
    struct FillHelper {
        Fn f;
        Buffer<T, D> *buf;

        template<typename... Args,
                 typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
        void operator()(Args... args) {
            (*buf)(args...) = f(args...);
        }

        FillHelper(Fn &&f, Buffer<T, D> *buf)
            : f(std::forward<Fn>(f)), buf(buf) {
        }
    };

public:
    /** Fill a buffer by evaluating a callable at every site. The
     * callable should look much like a callable passed to
     * for_each_element, but it should return the value that should be
     * stored to the coordinate corresponding to the arguments. */
    template<typename Fn,
             typename = typename std::enable_if<!std::is_arithmetic<typename std::decay<Fn>::type>::value>::type>
    Buffer<T, D> &fill(Fn &&f) {
        // We'll go via for_each_element. We need a variadic wrapper lambda.
        FillHelper<Fn> wrapper(std::forward<Fn>(f), this);
        return for_each_element(wrapper);
    }
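    // An illustrative fill-by-function: write a horizontal gradient
    // into a 2-D image (sketch):
    //
    //     Buffer<float, 2> im(256, 256);
    //     im.fill([](int x, int y) {
    //         return x / 255.0f;
    //     });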
    /** Check if an input buffer passed to an extern stage is a bounds
     * query. Compared to doing the host pointer check directly,
     * this both adds clarity to code and will facilitate moving to
     * another representation for bounds query arguments. */
    bool is_bounds_query() const {
        return buf.is_bounds_query();
    }

    /** Convenient check to verify that all of the interesting bytes in the Buffer
     * are initialized under MSAN. Note that by default, we use for_each_value() here so that
     * we skip any unused padding that isn't part of the Buffer; this isn't efficient,
     * but in MSAN mode, it doesn't matter. (Pass true for the flag to force check
     * the entire Buffer storage.) */
    void msan_check_mem_is_initialized(bool entire = false) const {
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
        if (entire) {
            __msan_check_mem_is_initialized(data(), size_in_bytes());
        } else {
            for_each_value([](T &v) { __msan_check_mem_is_initialized(&v, sizeof(T)); });
        }
#endif
#endif
    }
};

} // namespace Runtime
} // namespace Halide

#undef HALIDE_ALLOCA

#endif // HALIDE_RUNTIME_BUFFER_H