1 /*
2 * Copyright (c) 2017 Intel Corporation
3 * SPDX-License-Identifier: BSD-2-Clause
4 */
5
6 #include <vector>
7 #include <assert.h>
8 #include <algorithm>
9 #include <cmath>
10 #include <iostream>
11 #include <stdio.h>
12 #include "gufunc_scheduler.h"
13
14 // round not available on VS2010.
// Half-away-from-zero rounding; round() itself is not available on VS2010.
double guround (double number) {
    double shifted = (number < 0.0) ? (number - 0.5) : (number + 0.5);
    return (number < 0.0) ? std::ceil(shifted) : std::floor(shifted);
}
18
19 class RangeActual {
20 public:
21 std::vector<intp> start, end;
22
RangeActual()23 RangeActual() {}
24
RangeActual(intp s,intp e)25 RangeActual(intp s, intp e) {
26 start.push_back(s);
27 end.push_back(e);
28 }
29
RangeActual(const std::vector<intp> & s,const std::vector<intp> & e)30 RangeActual(const std::vector<intp> &s, const std::vector<intp> &e) {
31 assert(s.size() == e.size());
32 start = s;
33 end = e;
34 }
35
RangeActual(const std::vector<intp> & lens)36 RangeActual(const std::vector<intp> &lens) {
37 for(uintp i = 0; i < lens.size(); ++i) {
38 start.push_back(0);
39 end.push_back(lens[i] - 1);
40 }
41 }
42
RangeActual(uintp num_dims,intp * lens)43 RangeActual(uintp num_dims, intp *lens) {
44 for(uintp i = 0; i < num_dims; ++i) {
45 start.push_back(0);
46 end.push_back(lens[i] - 1);
47 }
48 }
49
RangeActual(uintp num_dims,intp * starts,intp * ends)50 RangeActual(uintp num_dims, intp *starts, intp *ends) {
51 for(uintp i = 0; i < num_dims; ++i) {
52 start.push_back(starts[i]);
53 end.push_back(ends[i]);
54 }
55 }
56
ndim() const57 uintp ndim() const {
58 return start.size();
59 }
60
iters_per_dim() const61 std::vector<intp> iters_per_dim() const {
62 std::vector<intp> ret;
63 for(uintp i = 0; i < start.size(); ++i) {
64 intp ret_val = end[i] - start[i] + 1;
65 if(end[i] < start[i])
66 ret_val = 0;
67 ret.push_back(ret_val);
68 }
69 return ret;
70 }
71 };
72
73 class dimlength {
74 public:
75 uintp dim;
76 intp length;
dimlength(uintp d,intp l)77 dimlength(uintp d, intp l) : dim(d), length(l) {}
78 };
79
80 struct dimlength_by_dim {
operator ()dimlength_by_dim81 bool operator()(const dimlength &a, const dimlength &b) const {
82 return a.dim < b.dim;
83 }
84 };
85
86 struct dimlength_by_length_reverse {
operator ()dimlength_by_length_reverse87 bool operator()(const dimlength &a, const dimlength &b) const {
88 return a.length > b.length;
89 }
90 };
91
92 class isf_range {
93 public:
94 uintp dim;
95 intp lower_bound, upper_bound;
isf_range(uintp d,intp l,intp u)96 isf_range(uintp d, intp l, intp u) : dim(d), lower_bound(l), upper_bound(u) {}
97 };
98
99 struct isf_range_by_dim {
operator ()isf_range_by_dim100 bool operator()(const isf_range &a, const isf_range &b) const {
101 return a.dim < b.dim;
102 }
103 };
104
105 /*
106 * m_a is the current start of the partition.
107 * m_b is the current end of the partition.
108 * m_c is the start of the next partition.
109 */
110 class chunk_info {
111 public:
112 intp m_a, m_b, m_c;
chunk_info(intp a,intp b,intp c)113 chunk_info(intp a, intp b, intp c) : m_a(a), m_b(b), m_c(c) {}
114 };
115
116 /*
117 * Split a space starting at rs and ending at re into "divisions" parts.
118 */
chunk(intp rs,intp re,intp divisions)119 chunk_info chunk(intp rs, intp re, intp divisions) {
120 assert(divisions >= 1);
121 intp total = (re - rs) + 1;
122 // If only one division then everything goes into that division.
123 if( divisions == 1) {
124 return chunk_info(rs, re, re + 1);
125 } else {
126 intp len = total / divisions;
127 intp res_end = rs + len - 1;
128 // Return the first division by starting at the beginning (rs) and going to
129 // the remaining length divided by the number of divisions.
130 return chunk_info(rs, res_end, res_end + 1);
131 }
132 }
133
equalizing_chunk(intp rs,intp re,intp divisions,float thread_percent)134 chunk_info equalizing_chunk(intp rs, intp re, intp divisions, float thread_percent) {
135 assert(divisions >= 1);
136 intp total = (re - rs) + 1;
137 if (divisions == 1) {
138 return chunk_info(rs, re, re + 1);
139 }
140 else {
141 intp len = total * thread_percent;
142 intp res_end = rs + len - 1;
143 return chunk_info(rs, res_end, res_end + 1);
144 }
145 }
146
isfRangeToActual(const std::vector<isf_range> & build)147 RangeActual isfRangeToActual(const std::vector<isf_range> &build) {
148 std::vector<isf_range> bunsort(build);
149 std::sort(bunsort.begin(), bunsort.end(), isf_range_by_dim());
150 std::vector<intp> lower_bounds(bunsort.size()), upper_bounds(bunsort.size());
151 for(uintp i = 0; i < bunsort.size(); ++i) {
152 lower_bounds[i] = bunsort[i].lower_bound;
153 upper_bounds[i] = bunsort[i].upper_bound;
154 }
155 return RangeActual(lower_bounds, upper_bounds);
156 }
157
158 /*
159 * Does the main work of splitting the iteration space between threads.
160 * In general, we start by allocating a number of threads to handle the largest dimension
161 * then call the routine recursively to allocate threads to the next largest dimension
 * and so on.
163 */
void divide_work(const RangeActual &full_iteration_space,
                 std::vector<RangeActual> &assignments,
                 std::vector<isf_range> &build,
                 uintp start_thread,
                 uintp end_thread,
                 const std::vector<dimlength> &dims,
                 uintp index) {
    // full_iteration_space: the complete space being scheduled.
    // assignments: output; one RangeActual per thread, indexed by thread id.
    // build: per-dimension ranges accumulated so far along this recursion path.
    // start_thread/end_thread: inclusive range of thread ids to give work to.
    // dims: dimensions sorted longest-first; index: position in dims to split now.

    // Number of threads used for this dimension.
    uintp num_threads = (end_thread - start_thread) + 1;

    assert(num_threads >= 1);
    // If there is only one thread left then it gets all the remaining work.
    if(num_threads == 1) {
        assert(build.size() <= dims.size());

        // build holds the ongoing constructed range of iterations in each dimension.
        // If the length of build is the number of dims then we have a complete allocation
        // so store it in assignments.
        if(build.size() == dims.size()) {
            assignments[start_thread] = isfRangeToActual(build);
        } else {
            // There are still more dimensions to add.
            // Create a copy of the incoming build.
            std::vector<isf_range> new_build(build.begin()+0, build.begin()+index);
            // Add an entry to new_build for this thread to handle the entire current dimension.
            new_build.push_back(isf_range(dims[index].dim, full_iteration_space.start[dims[index].dim], full_iteration_space.end[dims[index].dim]));
            // Recursively process.
            divide_work(full_iteration_space, assignments, new_build, start_thread, end_thread, dims, index+1);
        }
    } else {
        // There is more than 1 thread for handling this dimension so need to split the dimension between the threads.
        assert(index < dims.size());
        intp total_len = 0;
        // Compute the total number of iterations in the remaining dimensions to be processed, including the current one.
        // Dimensions of length <= 1 contribute nothing to the total.
        for(uintp i = index; i < dims.size(); ++i) total_len += dims[i].length > 1 ? dims[i].length : 0;
        uintp divisions_for_this_dim;
        if(total_len == 0) {
            // No splittable work remains; nominally give each thread its own division.
            divisions_for_this_dim = num_threads;
        } else {
            // We allocate the remaining threads proportionally to the ratio of the current dimension length to the total.
            divisions_for_this_dim = intp(guround(num_threads * ((float)dims[index].length / total_len)));
        }

        // These are used to divide the iteration space.
        intp chunkstart = full_iteration_space.start[dims[index].dim];
        intp chunkend = full_iteration_space.end[dims[index].dim];

        // These are used to divide threads.
        intp threadstart = start_thread;
        intp threadend = end_thread;

        // for each division of the current dimension...
        for(uintp i = 0; i < divisions_for_this_dim; ++i) {
            // Split the remaining threads evenly across the remaining divisions.
            chunk_info chunk_thread = chunk(threadstart, threadend, divisions_for_this_dim - i);
            // Number of threads used for this division.
            uintp threads_used_here = (1 + (chunk_thread.m_b - chunk_thread.m_a));
            // NOTE: the thread fraction uses num_threads BEFORE the decrement below;
            // the iteration chunk is sized by this division's share of the threads.
            chunk_info chunk_index = equalizing_chunk(chunkstart, chunkend, divisions_for_this_dim - i, threads_used_here / (float)num_threads);
            // Remember that the next division has threads_used_here fewer threads to allocate.
            num_threads -= threads_used_here;
            // m_c contains the next start value so update the iteration space and thread space in preparation for next iteration of this loop.
            chunkstart = chunk_index.m_c;
            threadstart = chunk_thread.m_c;
            // Copy the incoming build to new_build.
            std::vector<isf_range> new_build(build.begin()+0, build.begin()+index);
            // Add this dimension to new_build to handle start=m_a to end=m_b.
            new_build.push_back(isf_range(dims[index].dim, chunk_index.m_a, chunk_index.m_b));
            // Recursively process the next dimension.
            divide_work(full_iteration_space, assignments, new_build, chunk_thread.m_a, chunk_thread.m_b, dims, index+1);
        }
    }
}
235
236 /*
237 * Convert from internal format of vector of ranges to a flattened 2D-array usable by Python.
238 */
239 template<class T>
flatten_schedule(const std::vector<RangeActual> & sched,T * out_sched)240 void flatten_schedule(const std::vector<RangeActual> &sched, T *out_sched) {
241 uintp outer = sched.size();
242 uintp inner = sched[0].start.size();
243 for(uintp i = 0; i < outer; ++i) {
244 for(uintp j = 0; j < inner; ++j) {
245 out_sched[(i*inner*2) + j] = sched[i].start[j];
246 }
247 for(uintp j = 0; j < inner; ++j) {
248 out_sched[(i*inner*2) + j + inner] = sched[i].end[j];
249 }
250 }
251 }
252
253 /*
254 * Main routine that computes a static schedule.
255 * full_space is the iteration space in each dimension.
256 * num_sched is the number of worker threads.
257 */
std::vector<RangeActual> create_schedule(const RangeActual &full_space, uintp num_sched) {
    // Compute the number of iterations to be run for each dimension.
    std::vector<intp> ipd = full_space.iters_per_dim();

    // We special-case one dimensional.
    if(full_space.ndim() == 1) {
        // Get the number of iterations for the single dimension.
        intp ra_len = ipd[0];
        // If there are fewer iterations for the single dimension than there are threads...
        // (ra_len < 0 is a defensive check; iters_per_dim() clamps empty ranges to 0,
        // so presumably this only triggers on integer overflow — TODO confirm.)
        if(ra_len < 0 || (uintp)ra_len <= num_sched) {
            std::vector<RangeActual> ret;
            for(uintp i = 0; i < num_sched; ++i) {
                // If the amount of iterations is less than the current thread then give it no work,
                // signified by start of 1 and end of 0.
                if(ra_len < 0 || (uintp)ra_len <= i) {
                    ret.push_back(RangeActual((intp)1, (intp)0));
                } else {
                    // Give just i'th iteration to thread i.
                    ret.push_back(RangeActual(full_space.start[0] + i, full_space.start[0] + i));
                }
            }
            return ret;
        } else {
            // There are more iterations than threads.
            std::vector<RangeActual> ret;
            // cur holds the next unallocated iteration.
            intp cur = 0;
            // For each thread...
            for(uintp i = 0; i < num_sched; ++i) {
                // Compute the number of items to do in this thread as
                // the floor of the amount of work left (ra_len-cur) divided
                // by the number of threads left to which to allocate work.
                // (The -1/+1 form rounds the division up, spreading any
                // remainder across the earlier threads.)
                intp ilen = ((ra_len-cur-1) / (num_sched-i)) + 1;

                // Compute the start iteration number for that thread as the start iteration
                // plus the modal number of iterations times the thread number.
                intp start = full_space.start[0] + cur;
                intp end;
                // If this isn't the last thread then the end iteration number is one less
                // than the start iteration number of the next thread. If it is the last
                // thread then assign all remaining iterations to it.
                if(i < num_sched-1) {
                    end = full_space.start[0] + (cur + ilen) - 1;
                } else {
                    end = full_space.end[0];
                }
                // Record the iteration start and end in the schedule.
                ret.push_back(RangeActual(start, end));
                // Update the next unallocated iteration.
                cur += ilen;
            }
            return ret;
        }
    } else {
        // Two or more dimensions are handled generically here.
        std::vector<dimlength> dims;
        // Create a vector of objects associating dimensional index to length.
        for(uintp i = 0; i < ipd.size(); ++i) dims.push_back(dimlength(i, ipd[i]));
        // Sort the dimensions in the reverse order of their length.
        std::sort(dims.begin(), dims.end(), dimlength_by_length_reverse());
        // Pre-fill every thread's assignment with the empty range (start 1, end 0)
        // so threads that receive no work are still well-defined.
        std::vector<RangeActual> assignments(num_sched, RangeActual((intp)1,(intp)0));
        std::vector<isf_range> build;
        // Compute the division of work across dimensions and threads.
        divide_work(full_space, assignments, build, 0, num_sched-1, dims, 0);
        return assignments;
    }
}
325
326 /*
327 num_dim (D) is the number of dimensions of the iteration space.
328 starts is the range-start of each of those dimensions, inclusive.
329 ends is the range-end of each of those dimensions, inclusive.
330 num_threads is the number (N) of chunks to break the iteration space into
331 sched is pre-allocated memory for the schedule to be stored in and is of size NxD.
332 debug is non-zero if DEBUG_ARRAY_OPT is turned on.
333 */
do_scheduling_signed(uintp num_dim,intp * starts,intp * ends,uintp num_threads,intp * sched,intp debug)334 extern "C" void do_scheduling_signed(uintp num_dim, intp *starts, intp *ends, uintp num_threads, intp *sched, intp debug) {
335 if (debug) {
336 printf("do_scheduling_signed\n");
337 printf("num_dim = %d\n", (int)num_dim);
338 printf("ranges = (");
339 for (unsigned i = 0; i < num_dim; i++) {
340 printf("[%d, %d], ", (int)starts[i], (int)ends[i]);
341 }
342 printf(")\n");
343 printf("num_threads = %d\n", (int)num_threads);
344 }
345
346 if (num_threads == 0) return;
347
348 RangeActual full_space(num_dim, starts, ends);
349 std::vector<RangeActual> ret = create_schedule(full_space, num_threads);
350 flatten_schedule(ret, sched);
351 }
352
do_scheduling_unsigned(uintp num_dim,intp * starts,intp * ends,uintp num_threads,uintp * sched,intp debug)353 extern "C" void do_scheduling_unsigned(uintp num_dim, intp *starts, intp *ends, uintp num_threads, uintp *sched, intp debug) {
354 if (debug) {
355 printf("do_scheduling_unsigned\n");
356 printf("num_dim = %d\n", (int)num_dim);
357 printf("ranges = (");
358 for (unsigned i = 0; i < num_dim; i++) {
359 printf("[%d, %d], ", (int)starts[i], (int)ends[i]);
360 }
361 printf(")\n");
362 printf("num_threads = %d\n", (int)num_threads);
363 }
364
365 if (num_threads == 0) return;
366
367 RangeActual full_space(num_dim, starts, ends);
368 std::vector<RangeActual> ret = create_schedule(full_space, num_threads);
369 flatten_schedule(ret, sched);
370 }
371