/*
 * Copyright (c) 2017 Intel Corporation
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <cstdio>
#include <cstdint>
#include <cmath>
#include <cassert>
#include <vector>
#include <algorithm>
#include "gufunc_scheduler.h"

// round not available on VS2010.
double guround(double number) {
    return number < 0.0 ? ceil(number - 0.5) : floor(number + 0.5);
}

class RangeActual {
public:
    std::vector<intp> start, end;

    RangeActual() {}

    RangeActual(intp s, intp e) {
        start.push_back(s);
        end.push_back(e);
    }

    RangeActual(const std::vector<intp> &s, const std::vector<intp> &e) {
        assert(s.size() == e.size());
        start = s;
        end = e;
    }

    RangeActual(const std::vector<intp> &lens) {
        for(uintp i = 0; i < lens.size(); ++i) {
            start.push_back(0);
            end.push_back(lens[i] - 1);
        }
    }

    RangeActual(uintp num_dims, intp *lens) {
        for(uintp i = 0; i < num_dims; ++i) {
            start.push_back(0);
            end.push_back(lens[i] - 1);
        }
    }

    RangeActual(uintp num_dims, intp *starts, intp *ends) {
        for(uintp i = 0; i < num_dims; ++i) {
            start.push_back(starts[i]);
            end.push_back(ends[i]);
        }
    }

    uintp ndim() const {
        return start.size();
    }

    std::vector<intp> iters_per_dim() const {
        std::vector<intp> ret;
        for(uintp i = 0; i < start.size(); ++i) {
            intp ret_val = end[i] - start[i] + 1;
            if(end[i] < start[i])
                ret_val = 0;
            ret.push_back(ret_val);
        }
        return ret;
    }
};

class dimlength {
public:
    uintp dim;
    intp length;
    dimlength(uintp d, intp l) : dim(d), length(l) {}
};

struct dimlength_by_dim {
    bool operator()(const dimlength &a, const dimlength &b) const {
        return a.dim < b.dim;
    }
};

struct dimlength_by_length_reverse {
    bool operator()(const dimlength &a, const dimlength &b) const {
        return a.length > b.length;
    }
};

class isf_range {
public:
    uintp dim;
    intp lower_bound, upper_bound;
    isf_range(uintp d, intp l, intp u) : dim(d), lower_bound(l), upper_bound(u) {}
};

struct isf_range_by_dim {
    bool operator()(const isf_range &a, const isf_range &b) const {
        return a.dim < b.dim;
    }
};

/*
 * m_a is the current start of the partition.
 * m_b is the current end of the partition.
 * m_c is the start of the next partition.
 */
class chunk_info {
public:
    intp m_a, m_b, m_c;
    chunk_info(intp a, intp b, intp c) : m_a(a), m_b(b), m_c(c) {}
};

/*
 * Split a space starting at rs and ending at re into "divisions" parts.
 */
chunk_info chunk(intp rs, intp re, intp divisions) {
    assert(divisions >= 1);
    intp total = (re - rs) + 1;
    // If there is only one division then everything goes into it.
    if(divisions == 1) {
        return chunk_info(rs, re, re + 1);
    } else {
        // Return the first division: it starts at the beginning (rs) and its
        // length is the remaining length divided by the number of divisions.
        intp len = total / divisions;
        intp res_end = rs + len - 1;
        return chunk_info(rs, res_end, res_end + 1);
    }
}

/*
 * Like chunk() but sizes the first division by the fraction of the remaining
 * threads that it will use (thread_percent) rather than an even split.
 */
chunk_info equalizing_chunk(intp rs, intp re, intp divisions, float thread_percent) {
    assert(divisions >= 1);
    intp total = (re - rs) + 1;
    if (divisions == 1) {
        return chunk_info(rs, re, re + 1);
    } else {
        intp len = total * thread_percent;
        intp res_end = rs + len - 1;
        return chunk_info(rs, res_end, res_end + 1);
    }
}

RangeActual isfRangeToActual(const std::vector<isf_range> &build) {
    std::vector<isf_range> bunsort(build);
    std::sort(bunsort.begin(), bunsort.end(), isf_range_by_dim());
    std::vector<intp> lower_bounds(bunsort.size()), upper_bounds(bunsort.size());
    for(uintp i = 0; i < bunsort.size(); ++i) {
        lower_bounds[i] = bunsort[i].lower_bound;
        upper_bounds[i] = bunsort[i].upper_bound;
    }
    return RangeActual(lower_bounds, upper_bounds);
}
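/*
 * Worked examples (illustrative) for the chunking helpers above:
 * chunk(0, 9, 3) has total = 10 and len = 10/3 = 3, so the first division
 * is [0..2] with the next partition starting at 3.
 * equalizing_chunk(0, 9, 3, 0.5f) instead sizes the first division by the
 * thread fraction: len = 10 * 0.5 = 5, giving [0..4] with the next
 * partition starting at 5.
 */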
/*
 * Does the main work of splitting the iteration space between threads.
 * In general, we start by allocating a number of threads to handle the
 * largest dimension, then call the routine recursively to allocate threads
 * to the next largest dimension, and so on.
 */
void divide_work(const RangeActual &full_iteration_space,
                 std::vector<RangeActual> &assignments,
                 std::vector<isf_range> &build,
                 uintp start_thread,
                 uintp end_thread,
                 const std::vector<dimlength> &dims,
                 uintp index) {
    // Number of threads used for this dimension.
    uintp num_threads = (end_thread - start_thread) + 1;

    assert(num_threads >= 1);
    // If there is only one thread left then it gets all the remaining work.
    if(num_threads == 1) {
        assert(build.size() <= dims.size());

        // build holds the ongoing constructed range of iterations in each dimension.
        // If the length of build is the number of dims then we have a complete
        // allocation, so store it in assignments.
        if(build.size() == dims.size()) {
            assignments[start_thread] = isfRangeToActual(build);
        } else {
            // There are still more dimensions to add.
            // Create a copy of the incoming build.
            std::vector<isf_range> new_build(build.begin()+0, build.begin()+index);
            // Add an entry to new_build for this thread to handle the entire current dimension.
            new_build.push_back(isf_range(dims[index].dim,
                                          full_iteration_space.start[dims[index].dim],
                                          full_iteration_space.end[dims[index].dim]));
            // Recursively process.
            divide_work(full_iteration_space, assignments, new_build, start_thread, end_thread, dims, index+1);
        }
    } else {
        // There is more than one thread for handling this dimension, so the
        // dimension needs to be split between the threads.
        assert(index < dims.size());
        intp total_len = 0;
        // Compute the total number of iterations in the remaining dimensions
        // to be processed, including the current one.
        for(uintp i = index; i < dims.size(); ++i)
            total_len += dims[i].length > 1 ? dims[i].length : 0;

        uintp divisions_for_this_dim;
        if(total_len == 0) {
            divisions_for_this_dim = num_threads;
        } else {
            // Allocate the remaining threads proportionally to the ratio of the
            // current dimension's length to the total.
            divisions_for_this_dim = intp(guround(num_threads * ((float)dims[index].length / total_len)));
        }

        // These are used to divide the iteration space.
        intp chunkstart = full_iteration_space.start[dims[index].dim];
        intp chunkend   = full_iteration_space.end[dims[index].dim];

        // These are used to divide threads.
        intp threadstart = start_thread;
        intp threadend   = end_thread;

        // For each division of the current dimension...
        for(uintp i = 0; i < divisions_for_this_dim; ++i) {
            chunk_info chunk_thread = chunk(threadstart, threadend, divisions_for_this_dim - i);
            // Number of threads used for this division.
            uintp threads_used_here = (1 + (chunk_thread.m_b - chunk_thread.m_a));
            chunk_info chunk_index = equalizing_chunk(chunkstart, chunkend, divisions_for_this_dim - i, threads_used_here / (float)num_threads);
            // Remember that the next division has threads_used_here fewer threads to allocate.
            num_threads -= threads_used_here;
            // m_c contains the next start value, so update the iteration space and
            // thread space in preparation for the next iteration of this loop.
            chunkstart  = chunk_index.m_c;
            threadstart = chunk_thread.m_c;
            // Copy the incoming build to new_build.
            std::vector<isf_range> new_build(build.begin()+0, build.begin()+index);
            // Add this dimension to new_build to handle start=m_a to end=m_b.
            new_build.push_back(isf_range(dims[index].dim, chunk_index.m_a, chunk_index.m_b));
            // Recursively process the next dimension.
            divide_work(full_iteration_space, assignments, new_build, chunk_thread.m_a, chunk_thread.m_b, dims, index+1);
        }
    }
}
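/*
 * Illustrative trace of divide_work: for a 100 x 4 iteration space and 4
 * threads, dims sorted by decreasing length is [(dim 0, 100), (dim 1, 4)].
 * At index 0, total_len = 104 and divisions_for_this_dim =
 * guround(4 * 100/104.0) = 4, so each division gets one thread and a
 * 25-iteration slice of dim 0.  Each single-thread recursion then takes all
 * of dim 1, yielding the assignments [0..24] x [0..3], [25..49] x [0..3],
 * [50..74] x [0..3], and [75..99] x [0..3].
 */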
/*
 * Convert from the internal format of a vector of ranges to a flattened
 * 2D array usable from Python.
 */
template<class T>
void flatten_schedule(const std::vector<RangeActual> &sched, T *out_sched) {
    uintp outer = sched.size();
    uintp inner = sched[0].start.size();
    for(uintp i = 0; i < outer; ++i) {
        for(uintp j = 0; j < inner; ++j) {
            out_sched[(i*inner*2) + j] = sched[i].start[j];
        }
        for(uintp j = 0; j < inner; ++j) {
            out_sched[(i*inner*2) + j + inner] = sched[i].end[j];
        }
    }
}

/*
 * Main routine that computes a static schedule.
 * full_space is the iteration space in each dimension.
 * num_sched is the number of worker threads.
 */
std::vector<RangeActual> create_schedule(const RangeActual &full_space, uintp num_sched) {
    // Compute the number of iterations to be run in each dimension.
    std::vector<intp> ipd = full_space.iters_per_dim();

    // We special-case the one-dimensional schedule.
    if(full_space.ndim() == 1) {
        // Get the number of iterations for the single dimension.
        intp ra_len = ipd[0];
        // If there are no more iterations in the single dimension than there are threads...
        if(ra_len < 0 || (uintp)ra_len <= num_sched) {
            std::vector<RangeActual> ret;
            for(uintp i = 0; i < num_sched; ++i) {
                // If the number of iterations is less than or equal to the current
                // thread index then give that thread no work, signified by a start
                // of 1 and an end of 0.
                if(ra_len < 0 || (uintp)ra_len <= i) {
                    ret.push_back(RangeActual((intp)1, (intp)0));
                } else {
                    // Give just the i'th iteration to thread i.
                    ret.push_back(RangeActual(full_space.start[0] + i, full_space.start[0] + i));
                }
            }
            return ret;
        } else {
            // There are more iterations than threads.
            std::vector<RangeActual> ret;
            // cur holds the next unallocated iteration.
            intp cur = 0;
            // For each thread...
            for(uintp i = 0; i < num_sched; ++i) {
                // Compute the number of iterations for this thread as the ceiling
                // of the amount of work left (ra_len-cur) divided by the number
                // of threads left to which work can be allocated.
                intp ilen = ((ra_len-cur-1) / (num_sched-i)) + 1;
                // Compute the start iteration number for this thread as the start
                // of the dimension plus the number of iterations already allocated.
                intp start = full_space.start[0] + cur;
                intp end;
                // If this isn't the last thread then the end iteration number is one less
                // than the start iteration number of the next thread. If it is the last
                // thread then assign all remaining iterations to it.
                if(i < num_sched-1) {
                    end = full_space.start[0] + (cur + ilen) - 1;
                } else {
                    end = full_space.end[0];
                }
                // Record the iteration start and end in the schedule.
                ret.push_back(RangeActual(start, end));
                // Update the next unallocated iteration.
                cur += ilen;
            }
            return ret;
        }
    } else {
        // Two or more dimensions are handled generically here.
        std::vector<dimlength> dims;
        // Create a vector of objects associating each dimension's index with its length.
        for(uintp i = 0; i < ipd.size(); ++i)
            dims.push_back(dimlength(i, ipd[i]));
        // Sort the dimensions in decreasing order of their length.
        std::sort(dims.begin(), dims.end(), dimlength_by_length_reverse());
        std::vector<RangeActual> assignments(num_sched, RangeActual((intp)1, (intp)0));
        std::vector<isf_range> build;
        // Compute the division of work across dimensions and threads.
        divide_work(full_space, assignments, build, 0, num_sched-1, dims, 0);
        return assignments;
    }
}
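/*
 * Worked 1D example: create_schedule over [0..9] with num_sched = 4 hands
 * out ceiling-of-remaining chunks.  Thread 0 gets
 * ilen = ((10-0-1)/4)+1 = 3 -> [0..2], thread 1 gets [3..5], thread 2 gets
 * [6..7], and the last thread takes the remainder [8..9].
 */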
/*
 * num_dim (D) is the number of dimensions of the iteration space.
 * starts is the range-start of each of those dimensions, inclusive.
 * ends is the range-end of each of those dimensions, inclusive.
 * num_threads is the number (N) of chunks to break the iteration space into.
 * sched is pre-allocated memory for the schedule to be stored in; it holds
 * N x D start/end pairs (N*D*2 values).
 * debug is non-zero if DEBUG_ARRAY_OPT is turned on.
 */
extern "C" void do_scheduling_signed(uintp num_dim, intp *starts, intp *ends,
                                     uintp num_threads, intp *sched, intp debug) {
    if (debug) {
        printf("do_scheduling_signed\n");
        printf("num_dim = %d\n", (int)num_dim);
        printf("ranges = (");
        for (unsigned i = 0; i < num_dim; i++) {
            printf("[%d, %d], ", (int)starts[i], (int)ends[i]);
        }
        printf(")\n");
        printf("num_threads = %d\n", (int)num_threads);
    }

    if (num_threads == 0) return;

    RangeActual full_space(num_dim, starts, ends);
    std::vector<RangeActual> ret = create_schedule(full_space, num_threads);
    flatten_schedule(ret, sched);
}

extern "C" void do_scheduling_unsigned(uintp num_dim, intp *starts, intp *ends,
                                       uintp num_threads, uintp *sched, intp debug) {
    if (debug) {
        printf("do_scheduling_unsigned\n");
        printf("num_dim = %d\n", (int)num_dim);
        printf("ranges = (");
        for (unsigned i = 0; i < num_dim; i++) {
            printf("[%d, %d], ", (int)starts[i], (int)ends[i]);
        }
        printf(")\n");
        printf("num_threads = %d\n", (int)num_threads);
    }

    if (num_threads == 0) return;

    RangeActual full_space(num_dim, starts, ends);
    std::vector<RangeActual> ret = create_schedule(full_space, num_threads);
    flatten_schedule(ret, sched);
}
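/*
 * Usage sketch (illustrative; assumes intp/uintp are the signed/unsigned
 * integer typedefs from gufunc_scheduler.h):
 *
 *     intp starts[2] = {0, 0};
 *     intp ends[2]   = {99, 3};
 *     intp sched[4 * 2 * 2];   // N=4 threads, D=2 dims, a start and an end per dim
 *     do_scheduling_signed(2, starts, ends, 4, sched, 0);
 *
 * Row i of sched is then {start0, start1, end0, end1} for thread i, e.g.
 * thread 0 covers [0..24] x [0..3] as {0, 0, 24, 3}.
 */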