/*
 * Copyright (c) 2017 Intel Corporation
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <vector>
#include <assert.h>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <stdio.h>
#include "gufunc_scheduler.h"

// round not available on VS2010.
double guround(double number) {
    return number < 0.0 ? ceil(number - 0.5) : floor(number + 0.5);
}
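
/*
 * Illustrative examples, derived from the definition above:
 *   guround(2.5)  == 3.0
 *   guround(-2.5) == -3.0
 * i.e., halves are rounded away from zero.
 */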

class RangeActual {
public:
    std::vector<intp> start, end;

    RangeActual() {}

    RangeActual(intp s, intp e) {
        start.push_back(s);
        end.push_back(e);
    }

    RangeActual(const std::vector<intp> &s, const std::vector<intp> &e) {
        assert(s.size() == e.size());
        start = s;
        end = e;
    }

    RangeActual(const std::vector<intp> &lens) {
        for(uintp i = 0; i < lens.size(); ++i) {
            start.push_back(0);
            end.push_back(lens[i] - 1);
        }
    }

    RangeActual(uintp num_dims, intp *lens) {
        for(uintp i = 0; i < num_dims; ++i) {
            start.push_back(0);
            end.push_back(lens[i] - 1);
        }
    }

    RangeActual(uintp num_dims, intp *starts, intp *ends) {
        for(uintp i = 0; i < num_dims; ++i) {
            start.push_back(starts[i]);
            end.push_back(ends[i]);
        }
    }

    uintp ndim() const {
        return start.size();
    }

    std::vector<intp> iters_per_dim() const {
        std::vector<intp> ret;
        for(uintp i = 0; i < start.size(); ++i) {
            intp ret_val = end[i] - start[i] + 1;
            if(end[i] < start[i])
                ret_val = 0;
            ret.push_back(ret_val);
        }
        return ret;
    }
};
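
/*
 * Illustrative example: bounds are inclusive, so a RangeActual with
 * start = {0, 3} and end = {9, 2} reports iters_per_dim() == {10, 0};
 * the second dimension is empty because its end precedes its start.
 * The same convention (start 1, end 0) is used below to mark a thread
 * that receives no work.
 */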

class dimlength {
public:
    uintp dim;
    intp length;
    dimlength(uintp d, intp l) : dim(d), length(l) {}
};

struct dimlength_by_dim {
    bool operator()(const dimlength &a, const dimlength &b) const {
        return a.dim < b.dim;
    }
};

struct dimlength_by_length_reverse {
    bool operator()(const dimlength &a, const dimlength &b) const {
        return a.length > b.length;
    }
};

class isf_range {
public:
    uintp dim;
    intp lower_bound, upper_bound;
    isf_range(uintp d, intp l, intp u) : dim(d), lower_bound(l), upper_bound(u) {}
};

struct isf_range_by_dim {
    bool operator()(const isf_range &a, const isf_range &b) const {
        return a.dim < b.dim;
    }
};

/*
 * m_a is the current start of the partition.
 * m_b is the current end of the partition.
 * m_c is the start of the next partition.
 */
class chunk_info {
public:
    intp m_a, m_b, m_c;
    chunk_info(intp a, intp b, intp c) : m_a(a), m_b(b), m_c(c) {}
};
115 
116 /*
117  * Split a space starting at rs and ending at re into "divisions" parts.
118  */
chunk(intp rs,intp re,intp divisions)119 chunk_info chunk(intp rs, intp re, intp divisions) {
120     assert(divisions >= 1);
121     intp total = (re - rs) + 1;
122     // If only one division then everything goes into that division.
123     if( divisions == 1) {
124         return chunk_info(rs, re, re + 1);
125     } else {
126         intp len = total / divisions;
127         intp res_end = rs + len - 1;
128         // Return the first division by starting at the beginning (rs) and going to
129         // the remaining length divided by the number of divisions.
130         return chunk_info(rs, res_end, res_end + 1);
131     }
132 }
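
/*
 * Illustrative example: chunk(0, 9, 4) computes total = 10 and
 * len = 10 / 4 = 2, returning chunk_info(0, 1, 2): the first division
 * is [0, 1] and the next partition starts at 2.  Callers re-invoke
 * chunk() on [m_c, re] with one fewer division, so the remainder is
 * spread over the later divisions.
 */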

/*
 * Like chunk(), but size the first division as the thread_percent
 * fraction of the remaining iterations, where thread_percent is the
 * share of the remaining threads assigned to that division.
 */
chunk_info equalizing_chunk(intp rs, intp re, intp divisions, float thread_percent) {
    assert(divisions >= 1);
    intp total = (re - rs) + 1;
    if (divisions == 1) {
        return chunk_info(rs, re, re + 1);
    } else {
        intp len = total * thread_percent;
        intp res_end = rs + len - 1;
        return chunk_info(rs, res_end, res_end + 1);
    }
}
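
/*
 * Illustrative example: equalizing_chunk(0, 99, 4, 0.25f) computes
 * total = 100 and len = 25, returning chunk_info(0, 24, 25), i.e. the
 * first quarter of the iterations for the first quarter of the threads.
 */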

/*
 * Convert a vector of per-dimension isf_range entries into a RangeActual
 * whose bounds are ordered by dimension index.
 */
RangeActual isfRangeToActual(const std::vector<isf_range> &build) {
    std::vector<isf_range> bunsort(build);
    std::sort(bunsort.begin(), bunsort.end(), isf_range_by_dim());
    std::vector<intp> lower_bounds(bunsort.size()), upper_bounds(bunsort.size());
    for(uintp i = 0; i < bunsort.size(); ++i) {
        lower_bounds[i] = bunsort[i].lower_bound;
        upper_bounds[i] = bunsort[i].upper_bound;
    }
    return RangeActual(lower_bounds, upper_bounds);
}
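
/*
 * Illustrative example: a build of {isf_range(1, 0, 4), isf_range(0, 2, 9)}
 * sorts to dimension order and becomes RangeActual(start = {2, 0},
 * end = {9, 4}).
 */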

/*
 * Does the main work of splitting the iteration space between threads.
 * In general, we start by allocating a number of threads to handle the
 * largest dimension, then call the routine recursively to allocate
 * threads to the next largest dimension, and so on.
 */
void divide_work(const RangeActual &full_iteration_space,
                 std::vector<RangeActual> &assignments,
                 std::vector<isf_range> &build,
                 uintp start_thread,
                 uintp end_thread,
                 const std::vector<dimlength> &dims,
                 uintp index) {
    // Number of threads used for this dimension.
    uintp num_threads = (end_thread - start_thread) + 1;

    assert(num_threads >= 1);
    // If there is only one thread left then it gets all the remaining work.
    if(num_threads == 1) {
        assert(build.size() <= dims.size());

        // build holds the ongoing constructed range of iterations in each dimension.
        // If the length of build is the number of dims then we have a complete allocation
        // so store it in assignments.
        if(build.size() == dims.size()) {
            assignments[start_thread] = isfRangeToActual(build);
        } else {
            // There are still more dimensions to add.
            // Create a copy of the incoming build.
            std::vector<isf_range> new_build(build.begin(), build.begin()+index);
            // Add an entry to new_build for this thread to handle the entire current dimension.
            new_build.push_back(isf_range(dims[index].dim, full_iteration_space.start[dims[index].dim], full_iteration_space.end[dims[index].dim]));
            // Recursively process.
            divide_work(full_iteration_space, assignments, new_build, start_thread, end_thread, dims, index+1);
        }
    } else {
        // There is more than one thread for handling this dimension so we need to split the dimension between the threads.
        assert(index < dims.size());
        intp total_len = 0;
        // Compute the total length of the remaining dimensions to be processed,
        // including the current one, ignoring dimensions of length <= 1.
        for(uintp i = index; i < dims.size(); ++i) total_len += dims[i].length > 1 ? dims[i].length : 0;
        uintp divisions_for_this_dim;
        if(total_len == 0) {
            divisions_for_this_dim = num_threads;
        } else {
            // We allocate the remaining threads proportionally to the ratio of the current dimension length to the total.
            divisions_for_this_dim = intp(guround(num_threads * ((float)dims[index].length / total_len)));
        }

        // These are used to divide the iteration space.
        intp chunkstart = full_iteration_space.start[dims[index].dim];
        intp chunkend   = full_iteration_space.end[dims[index].dim];

        // These are used to divide threads.
        intp threadstart = start_thread;
        intp threadend   = end_thread;

        // For each division of the current dimension...
        for(uintp i = 0; i < divisions_for_this_dim; ++i) {
            chunk_info chunk_thread = chunk(threadstart, threadend, divisions_for_this_dim - i);
            // Number of threads used for this division.
            uintp threads_used_here = (1 + (chunk_thread.m_b - chunk_thread.m_a));
            chunk_info chunk_index = equalizing_chunk(chunkstart, chunkend, divisions_for_this_dim - i, threads_used_here / (float)num_threads);
            // Remember that the next division has threads_used_here fewer threads to allocate.
            num_threads -= threads_used_here;
            // m_c contains the next start value so update the iteration space and thread space in preparation for the next iteration of this loop.
            chunkstart = chunk_index.m_c;
            threadstart = chunk_thread.m_c;
            // Copy the incoming build to new_build.
            std::vector<isf_range> new_build(build.begin(), build.begin()+index);
            // Add this dimension to new_build to handle start=m_a to end=m_b.
            new_build.push_back(isf_range(dims[index].dim, chunk_index.m_a, chunk_index.m_b));
            // Recursively process the next dimension.
            divide_work(full_iteration_space, assignments, new_build, chunk_thread.m_a, chunk_thread.m_b, dims, index+1);
        }
    }
}
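
/*
 * Illustrative trace: dividing a 10x4 space among 4 threads, with dims
 * sorted as {dim 0: length 10, dim 1: length 4}.  The length-10
 * dimension gets round(4 * 10/14) = 3 divisions holding 1, 1, and 2
 * threads, producing the blocks [0,1], [2,3], and [4,9] in that
 * dimension; the recursive call then splits [4,9] along the second
 * dimension between the two remaining threads.
 */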

/*
 * Convert from the internal format of a vector of ranges to a flattened 2D array usable by Python.
 */
template<class T>
void flatten_schedule(const std::vector<RangeActual> &sched, T *out_sched) {
    uintp outer = sched.size();
    uintp inner = sched[0].start.size();
    for(uintp i = 0; i < outer; ++i) {
        for(uintp j = 0; j < inner; ++j) {
            out_sched[(i*inner*2) + j] = sched[i].start[j];
        }
        for(uintp j = 0; j < inner; ++j) {
            out_sched[(i*inner*2) + j + inner] = sched[i].end[j];
        }
    }
}
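
/*
 * Illustrative example: each row of out_sched holds the D start indices
 * followed by the D end indices for one thread, so a 2D assignment with
 * start = {4, 0} and end = {9, 1} is written as the row [4, 0, 9, 1].
 */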

/*
 * Main routine that computes a static schedule.
 * full_space is the iteration space in each dimension.
 * num_sched is the number of worker threads.
 */
std::vector<RangeActual> create_schedule(const RangeActual &full_space, uintp num_sched) {
    // Compute the number of iterations to be run for each dimension.
    std::vector<intp> ipd = full_space.iters_per_dim();

    // We special-case the one-dimensional path.
    if(full_space.ndim() == 1) {
        // Get the number of iterations for the single dimension.
        intp ra_len = ipd[0];
        // If there are no more iterations for the single dimension than there are threads...
        if(ra_len < 0 || (uintp)ra_len <= num_sched) {
            std::vector<RangeActual> ret;
            for(uintp i = 0; i < num_sched; ++i) {
                // If no iteration remains for the current thread then give it no work,
                // signified by a start of 1 and an end of 0.
                if(ra_len < 0 || (uintp)ra_len <= i) {
                    ret.push_back(RangeActual((intp)1, (intp)0));
                } else {
                    // Give just the i'th iteration to thread i.
                    ret.push_back(RangeActual(full_space.start[0] + i, full_space.start[0] + i));
                }
            }
            return ret;
        } else {
            // There are more iterations than threads.
            std::vector<RangeActual> ret;
            // cur holds the next unallocated iteration.
            intp cur = 0;
            // For each thread...
            for(uintp i = 0; i < num_sched; ++i) {
                // Compute the number of items to do in this thread as
                // the ceiling of the amount of work left (ra_len-cur) divided
                // by the number of threads left to which to allocate work.
                intp ilen = ((ra_len-cur-1) / (num_sched-i)) + 1;

                // Compute the start iteration number for this thread as the start
                // of the full space plus the number of iterations already allocated (cur).
                intp start = full_space.start[0] + cur;
                intp end;
                // If this isn't the last thread then the end iteration number is one less
                // than the start iteration number of the next thread.  If it is the last
                // thread then assign all remaining iterations to it.
                if(i < num_sched-1) {
                    end = full_space.start[0] + (cur + ilen) - 1;
                } else {
                    end = full_space.end[0];
                }
                // Record the iteration start and end in the schedule.
                ret.push_back(RangeActual(start, end));
                // Update the next unallocated iteration.
                cur += ilen;
            }
            return ret;
        }
    } else {
        // Two or more dimensions are handled generically here.
        std::vector<dimlength> dims;
        // Create a vector of objects associating dimensional index to length.
        for(uintp i = 0; i < ipd.size(); ++i) dims.push_back(dimlength(i, ipd[i]));
        // Sort the dimensions in decreasing order of length.
        std::sort(dims.begin(), dims.end(), dimlength_by_length_reverse());
        std::vector<RangeActual> assignments(num_sched, RangeActual((intp)1,(intp)0));
        std::vector<isf_range> build;
        // Compute the division of work across dimensions and threads.
        divide_work(full_space, assignments, build, 0, num_sched-1, dims, 0);
        return assignments;
    }
}
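
/*
 * Illustrative example: a 1D space of 10 iterations ([0, 9]) across 3
 * threads yields the ranges [0, 3], [4, 6], and [7, 9]; with only 2
 * iterations and 3 threads, threads 0 and 1 get [0, 0] and [1, 1] and
 * thread 2 gets the empty range [1, 0].
 */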

/*
    num_dim (D) is the number of dimensions of the iteration space.
    starts is the range-start of each of those dimensions, inclusive.
    ends is the range-end of each of those dimensions, inclusive.
    num_threads is the number (N) of chunks to break the iteration space into.
    sched is pre-allocated memory for the schedule to be stored in and is of size N x 2D
    (each row holds the D start indices followed by the D end indices for one chunk).
    debug is non-zero if DEBUG_ARRAY_OPT is turned on.
*/
extern "C" void do_scheduling_signed(uintp num_dim, intp *starts, intp *ends, uintp num_threads, intp *sched, intp debug) {
    if (debug) {
        printf("do_scheduling_signed\n");
        printf("num_dim = %d\n", (int)num_dim);
        printf("ranges = (");
        for (unsigned i = 0; i < num_dim; i++) {
            printf("[%d, %d], ", (int)starts[i], (int)ends[i]);
        }
        printf(")\n");
        printf("num_threads = %d\n", (int)num_threads);
    }

    if (num_threads == 0) return;

    RangeActual full_space(num_dim, starts, ends);
    std::vector<RangeActual> ret = create_schedule(full_space, num_threads);
    flatten_schedule(ret, sched);
}

extern "C" void do_scheduling_unsigned(uintp num_dim, intp *starts, intp *ends, uintp num_threads, uintp *sched, intp debug) {
    if (debug) {
        printf("do_scheduling_unsigned\n");
        printf("num_dim = %d\n", (int)num_dim);
        printf("ranges = (");
        for (unsigned i = 0; i < num_dim; i++) {
            printf("[%d, %d], ", (int)starts[i], (int)ends[i]);
        }
        printf(")\n");
        printf("num_threads = %d\n", (int)num_threads);
    }

    if (num_threads == 0) return;

    RangeActual full_space(num_dim, starts, ends);
    std::vector<RangeActual> ret = create_schedule(full_space, num_threads);
    flatten_schedule(ret, sched);
}
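
/*
 * Usage sketch (illustrative, values are hypothetical): scheduling a
 * 2D space of 4x10 iterations across 4 threads from C.
 *
 *   intp starts[] = {0, 0};
 *   intp ends[]   = {3, 9};
 *   intp sched[4 * 2 * 2];  // num_threads * 2 * num_dim
 *   do_scheduling_signed(2, starts, ends, 4, sched, 0);
 *   // Each row of sched now holds [start0, start1, end0, end1]
 *   // for one thread.
 */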