#ifndef HALIDE_RUNTIME_DEVICE_BUFFER_UTILS_H
#define HALIDE_RUNTIME_DEVICE_BUFFER_UTILS_H

#include "HalideRuntime.h"
#include "device_interface.h"
#include "printer.h"

namespace Halide {
namespace Runtime {
namespace Internal {

// A host <-> dev copy should be done with the fewest possible number
// of contiguous copies to minimize driver overhead. If our
// halide_buffer_t has strides larger than its extents (e.g. because
// it represents a sub-region of a larger halide_buffer_t) we can't
// safely copy it back and forth using a single contiguous copy,
// because we'd clobber in-between values that another thread might be
// using. In the best case we can do a single contiguous copy, but in
// the worst case we need to individually copy over every pixel.
//
// This problem is made extra difficult by the fact that the ordering
// of the dimensions in a halide_buffer_t doesn't relate to memory layout at
// all, so the strides could be in any order.
//
// We solve it by representing a copy job we need to perform as a
// device_copy struct. It describes a multi-dimensional array of
// copies to perform. Initially it describes copying over a single
// pixel at a time. We then try to discover contiguous groups of
// copies that can be coalesced into a single larger copy.
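//
// As an illustrative example (not tied to any particular backend): consider
// copying a 16x16 crop of 4-byte pixels taken from a 64-pixel-wide image, so
// the crop's strides are {1, 64} in elements ({4, 256} in bytes). The copy
// starts out as 16*16 single-pixel (4-byte) tasks; the innermost dimension's
// stride matches the chunk size, so it folds in, leaving 16 contiguous
// 64-byte copies, one per row. The row stride (256 bytes) doesn't match the
// new chunk size, so no further folding is possible.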

// The struct that describes a host <-> dev copy to perform.
#define MAX_COPY_DIMS 16
struct device_copy {
    // Opaque handles for the source and destination memory.
    uint64_t src, dst;
    // The offset (in bytes) into the source memory at which to start copying.
    uint64_t src_begin;
    // The multidimensional array of contiguous copy tasks that need to be done.
    uint64_t extent[MAX_COPY_DIMS];
    // The strides (in bytes) that separate adjacent copy tasks in each dimension.
    uint64_t src_stride_bytes[MAX_COPY_DIMS];
    uint64_t dst_stride_bytes[MAX_COPY_DIMS];
    // How many contiguous bytes to copy per task.
    uint64_t chunk_size;
};
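
// Continuing the illustrative 16x16 crop example above: after coalescing, the
// resulting device_copy would have chunk_size = 64, extent[0] = 16,
// src_stride_bytes[0] = dst_stride_bytes[0] = 256, and every remaining extent
// equal to 1.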

WEAK void copy_memory_helper(const device_copy &copy, int d, int64_t src_off, int64_t dst_off) {
    // Skip size-1 dimensions.
    while (d >= 0 && copy.extent[d] == 1) {
        d--;
    }

    if (d == -1) {
        // We've recursed past the innermost dimension: do one contiguous copy.
        const void *from = (void *)(copy.src + src_off);
        void *to = (void *)(copy.dst + dst_off);
        memcpy(to, from, copy.chunk_size);
    } else {
        // Iterate over this dimension, recursing into the inner dimensions.
        for (uint64_t i = 0; i < copy.extent[d]; i++) {
            copy_memory_helper(copy, d - 1, src_off, dst_off);
            src_off += copy.src_stride_bytes[d];
            dst_off += copy.dst_stride_bytes[d];
        }
    }
}

WEAK void copy_memory(const device_copy &copy, void *user_context) {
    // If this is a zero copy buffer, these pointers will be the same.
    if (copy.src != copy.dst) {
        copy_memory_helper(copy, MAX_COPY_DIMS - 1, copy.src_begin, 0);
    } else {
        debug(user_context) << "copy_memory: no copy needed as pointers are the same.\n";
    }
}
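
// For illustration only (assuming a backend where the device allocation is
// directly addressable by the host, e.g. a unified-memory target), a
// host-to-device copy built from these helpers might look like:
//
//   device_copy c = make_host_to_device_copy(buf);
//   copy_memory(c, user_context);
//
// Backends with genuinely separate memories instead walk the same device_copy
// structure themselves, issuing one driver copy per contiguous chunk.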

// Fills the entire dst buffer, which must be contained within src.
WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host,
                                  const halide_buffer_t *dst, bool dst_host) {
    // Make a copy job representing copying the first pixel only.
    device_copy c;
    c.src = src_host ? (uint64_t)src->host : src->device;
    c.dst = dst_host ? (uint64_t)dst->host : dst->device;
    c.chunk_size = src->type.bytes();
    for (int i = 0; i < MAX_COPY_DIMS; i++) {
        c.extent[i] = 1;
        c.src_stride_bytes[i] = 0;
        c.dst_stride_bytes[i] = 0;
    }

    // Bail out on mismatched or oversized buffers before touching dst->dim.
    // These conditions should also be checked for outside this fn.
    if (src->dimensions != dst->dimensions ||
        src->type.bytes() != dst->type.bytes() ||
        dst->dimensions > MAX_COPY_DIMS) {
        device_copy zero = {0};
        return zero;
    }

    if (c.chunk_size == 0) {
        // This buffer apparently represents no memory. Return a zero'd copy
        // task.
        device_copy zero = {0};
        return zero;
    }

    // Offset the src base pointer (in bytes) to the start of dst's region
    // within src.
    c.src_begin = 0;
    for (int i = 0; i < src->dimensions; i++) {
        c.src_begin += src->dim[i].stride * (dst->dim[i].min - src->dim[i].min);
    }
    c.src_begin *= c.chunk_size;

    // Now expand it to copy all the pixels (one at a time) by taking
    // the extents and strides from the halide_buffer_ts. Dimensions
    // are added to the copy by inserting them such that the strides
    // are in ascending order in the dst.
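    // As an illustrative example: for a transposed 2D dst whose per-dimension
    // strides are {1024, 4} bytes, dimension 1 is inserted before dimension 0,
    // so the copy iterates the 4-byte-stride dimension innermost, which is
    // what later lets it fold into chunk_size.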
    for (int i = 0; i < dst->dimensions; i++) {
        // TODO: deal with negative strides.
        uint64_t dst_stride_bytes = dst->dim[i].stride * dst->type.bytes();
        uint64_t src_stride_bytes = src->dim[i].stride * src->type.bytes();
        // Insert the dimension sorted into the buffer copy.
        int insert;
        for (insert = 0; insert < i; insert++) {
            // If the stride is 0, we put it at the end because it can't be
            // folded.
            if (dst_stride_bytes < c.dst_stride_bytes[insert] && dst_stride_bytes != 0) {
                break;
            }
        }
        for (int j = i; j > insert; j--) {
            c.extent[j] = c.extent[j - 1];
            c.dst_stride_bytes[j] = c.dst_stride_bytes[j - 1];
            c.src_stride_bytes[j] = c.src_stride_bytes[j - 1];
        }
        c.extent[insert] = dst->dim[i].extent;
        // debug(NULL) << "c.extent[" << insert << "] = " << (int)(c.extent[insert]) << "\n";
        c.dst_stride_bytes[insert] = dst_stride_bytes;
        c.src_stride_bytes[insert] = src_stride_bytes;
    }

    // Attempt to fold contiguous dimensions into the chunk
    // size. Since the dimensions are sorted by stride, and the
    // strides must be greater than or equal to the chunk size, this
    // means we can just delete the innermost dimension as long as its
    // stride in both src and dst is equal to the chunk size.
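    // For example (purely illustrative): a dense 8x8x8 buffer of 4-byte
    // elements has sorted strides of 4, 32, and 256 bytes. Each matches the
    // growing chunk size in turn (4 -> 32 -> 256), so all three dimensions
    // fold away and the whole buffer becomes a single 2048-byte copy.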
    while (c.chunk_size == c.src_stride_bytes[0] &&
           c.chunk_size == c.dst_stride_bytes[0]) {
        // Fold the innermost dimension's extent into the chunk_size.
        c.chunk_size *= c.extent[0];

        // Erase the innermost dimension from the list of dimensions to
        // iterate over.
        for (int j = 1; j < MAX_COPY_DIMS; j++) {
            c.extent[j - 1] = c.extent[j];
            c.src_stride_bytes[j - 1] = c.src_stride_bytes[j];
            c.dst_stride_bytes[j - 1] = c.dst_stride_bytes[j];
        }
        c.extent[MAX_COPY_DIMS - 1] = 1;
        c.src_stride_bytes[MAX_COPY_DIMS - 1] = 0;
        c.dst_stride_bytes[MAX_COPY_DIMS - 1] = 0;
    }
    return c;
}

WEAK device_copy make_host_to_device_copy(const halide_buffer_t *buf) {
    return make_buffer_copy(buf, true, buf, false);
}

WEAK device_copy make_device_to_host_copy(const halide_buffer_t *buf) {
    return make_buffer_copy(buf, false, buf, true);
}

// Caller is expected to verify that src->dimensions == dst->dimensions.
ALWAYS_INLINE int64_t calc_device_crop_byte_offset(const struct halide_buffer_t *src, struct halide_buffer_t *dst) {
    int64_t offset = 0;
    for (int i = 0; i < src->dimensions; i++) {
        offset += (dst->dim[i].min - src->dim[i].min) * src->dim[i].stride;
    }
    offset *= src->type.bytes();
    return offset;
}
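
// As a purely illustrative example: cropping a buffer of 4-byte elements with
// strides {1, 100} so that the crop's mins are offset by (10, 20) from the
// parent's gives a byte offset of (10 * 1 + 20 * 100) * 4 = 8040.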

// Caller is expected to verify that src->dimensions == dst->dimensions + 1,
// and that slice_dim and slice_pos are valid within src.
ALWAYS_INLINE int64_t calc_device_slice_byte_offset(const struct halide_buffer_t *src, int slice_dim, int slice_pos) {
    int64_t offset = (slice_pos - src->dim[slice_dim].min) * src->dim[slice_dim].stride;
    offset *= src->type.bytes();
    return offset;
}

}  // namespace Internal
}  // namespace Runtime
}  // namespace Halide

#endif  // HALIDE_RUNTIME_DEVICE_BUFFER_UTILS_H