1 /* Standard C headers */
2 #include <assert.h>
3 #include <stdbool.h>
4 #include <stdint.h>
5 #include <stdlib.h>
6 #include <string.h>
7 
8 #if PTHREADPOOL_USE_CPUINFO
9 	#include <cpuinfo.h>
10 #endif
11 
12 /* Dependencies */
13 #include <fxdiv.h>
14 
15 /* Public library header */
16 #include <pthreadpool.h>
17 
18 /* Internal library headers */
19 #include "threadpool-atomics.h"
20 #include "threadpool-common.h"
21 #include "threadpool-object.h"
22 #include "threadpool-utils.h"
23 
24 
/*
 * Fast path for pthreadpool_parallelize_1d: the calling worker first drains
 * its own pre-assigned index range, then steals leftover items from the
 * ranges of the other threads.
 *
 * threadpool - pool holding the published task pointer, argument, and
 *              per-thread work ranges (must be non-NULL).
 * thread     - this worker's per-thread state (must be non-NULL).
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Task function and user argument were published before workers started */
	const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length counters decrement with unsigned wrap-around. At most
	 * threads_count threads can concurrently decrement an already-empty
	 * counter, wrapping it into the top threads_count values of size_t,
	 * i.e. into [-threads_count, -1] (mod 2^N). Hence a post-decrement
	 * value below (size_t) -threads_count proves the decrement claimed a
	 * real, unprocessed item.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items, front to back */
	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, range_start++);
	}

	/* There still may be other threads with work: walk thread ids in
	 * reverse circular order and steal from the tail of their ranges. */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			/* Claim an index from the end of the other thread's range */
			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			task(argument, index);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
60 
/*
 * Fast path for pthreadpool_parallelize_1d_with_uarch: like the plain 1D
 * fast path, but each task invocation also receives the microarchitecture
 * index of the core the worker currently runs on.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_with_uarch_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_with_id_t task = (pthreadpool_task_1d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Query the current core's uarch index; fall back to the caller-provided
	 * default when cpuinfo is unavailable or the index exceeds the caller's
	 * declared maximum. */
	const uint32_t default_uarch_index = threadpool->params.parallelize_1d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_1d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	const size_t threads_count = threadpool->threads_count.value;
	/* Post-decrement range_length below (size_t) -threads_count means a real
	 * item was claimed; concurrent over-decrements by up to threads_count
	 * threads wrap into [-threads_count, -1] (mod 2^N) and stay at or above
	 * this threshold. */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items, front to back */
	size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, uarch_index, range_start++);
	}

	/* There still may be other threads with work: steal from the tail of
	 * their ranges, walking thread ids in reverse circular order. */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			task(argument, uarch_index, index);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
105 
/*
 * Fast path for pthreadpool_parallelize_1d_tile_1d: work items are tiles of
 * up to `tile` consecutive indices; the task receives the tile's start index
 * and its (possibly clamped) length.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_1d_tile_1d_t task = (pthreadpool_task_1d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* Post-decrement range_length below (size_t) -threads_count means a real
	 * tile was claimed; concurrent over-decrements by up to threads_count
	 * threads wrap into [-threads_count, -1] (mod 2^N) and stay at or above
	 * this threshold. */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of tiles; range_start counts tiles, so the
	 * element offset of the first tile is range_start * tile. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const size_t tile = threadpool->params.parallelize_1d_tile_1d.tile;
	size_t tile_start = range_start * tile;

	const size_t range = threadpool->params.parallelize_1d_tile_1d.range;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps the last tile when range is not a multiple of tile */
		task(argument, tile_start, min(range - tile_start, tile));
		tile_start += tile;
	}

	/* There still may be other threads with work: steal tiles from the tail
	 * of their ranges, walking thread ids in reverse circular order. */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t tile_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const size_t tile_start = tile_index * tile;
			task(argument, tile_start, min(range - tile_start, tile));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
147 
/*
 * Fast path for pthreadpool_parallelize_2d: linear work indices are mapped
 * to (i, j) coordinates with a precomputed fxdiv divisor for range_j, which
 * avoids a hardware division per item.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_t task = (pthreadpool_task_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* Post-decrement range_length below (size_t) -threads_count means a real
	 * item was claimed; concurrent over-decrements by up to threads_count
	 * threads wrap into [-threads_count, -1] (mod 2^N) and stay at or above
	 * this threshold. */
	const size_t range_threshold = -threads_count;

	/* Decompose the linear start index once: i = start / range_j, j = start % range_j */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(range_start, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;

	/* Process thread's own range, advancing (i, j) incrementally with carry */
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j);
		if (++j == range_j.value) {
			j = 0;
			i += 1;
		}
	}

	/* There still may be other threads with work: steal linear indices from
	 * the tail of their ranges and decompose each one on the fly. */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(linear_index, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
193 
/*
 * Fast path for pthreadpool_parallelize_2d_tile_1d: iterates (i, start_j)
 * where the j dimension is tiled by tile_j. Linear tile indices are mapped
 * to coordinates via the precomputed fxdiv divisor tile_range_j.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_1d_t task = (pthreadpool_task_2d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* Post-decrement range_length below (size_t) -threads_count means a real
	 * tile was claimed; over-decrements by up to threads_count threads wrap
	 * into [-threads_count, -1] (mod 2^N) and stay at or above this threshold. */
	const size_t range_threshold = -threads_count;

	/* Decompose the linear start tile index into (i, tile index along j),
	 * then scale the j tile index to an element offset. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_1d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_2d_tile_1d.tile_j;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;

	const size_t range_j = threadpool->params.parallelize_2d_tile_1d.range_j;
	/* Process thread's own range, stepping start_j by tile_j with carry into i */
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps the last tile in the j dimension */
		task(argument, i, start_j, min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			i += 1;
		}
	}

	/* There still may be other threads with work: steal tile indices from
	 * the tail of their ranges and decompose each one on the fly. */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, tile_index_i_j.quotient, start_j, min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
243 
/*
 * Fast path for pthreadpool_parallelize_2d_tile_2d: both dimensions are
 * tiled (tile_i x tile_j). Each work item is one 2D tile; the task receives
 * the tile origin and its clamped extents.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_2d_t task = (pthreadpool_task_2d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* Post-decrement range_length below (size_t) -threads_count means a real
	 * tile was claimed; over-decrements by up to threads_count threads wrap
	 * into [-threads_count, -1] (mod 2^N) and stay at or above this threshold. */
	const size_t range_threshold = -threads_count;

	/* Decompose the linear start tile index into per-dimension tile indices,
	 * then scale each to element offsets. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d.tile_i;
	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d.tile_j;
	size_t start_i = tile_index_i_j.quotient * tile_i;
	size_t start_j = tile_index_i_j.remainder * tile_j;

	const size_t range_i = threadpool->params.parallelize_2d_tile_2d.range_i;
	const size_t range_j = threadpool->params.parallelize_2d_tile_2d.range_j;
	/* Process thread's own range, stepping j-first with carry into i */
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps edge tiles in each dimension */
		task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			start_i += tile_i;
		}
	}

	/* There still may be other threads with work: steal tile indices from
	 * the tail of their ranges and decompose each one on the fly. */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_i = tile_index_i_j.quotient * tile_i;
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
296 
/*
 * Fast path for pthreadpool_parallelize_2d_tile_2d_with_uarch: 2D tiling in
 * both dimensions, with the current core's microarchitecture index passed to
 * every task invocation.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_2d_tile_2d_with_id_t task = (pthreadpool_task_2d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Query the current core's uarch index; fall back to the default when
	 * cpuinfo is unavailable or the index exceeds the declared maximum. */
	const uint32_t default_uarch_index = threadpool->params.parallelize_2d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_2d_tile_2d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	const size_t threads_count = threadpool->threads_count.value;
	/* Post-decrement range_length below (size_t) -threads_count means a real
	 * tile was claimed; over-decrements by up to threads_count threads wrap
	 * into [-threads_count, -1] (mod 2^N) and stay at or above this threshold. */
	const size_t range_threshold = -threads_count;

	/* Decompose the linear start tile index into per-dimension tile indices,
	 * then scale each to element offsets. */
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_range_j;
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_result_size_t index = fxdiv_divide_size_t(range_start, tile_range_j);
	const size_t range_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_i;
	const size_t tile_i = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_i;
	const size_t range_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.range_j;
	const size_t tile_j = threadpool->params.parallelize_2d_tile_2d_with_uarch.tile_j;
	size_t start_i = index.quotient * tile_i;
	size_t start_j = index.remainder * tile_j;

	/* Process thread's own range, stepping j-first with carry into i */
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps edge tiles in each dimension */
		task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		start_j += tile_j;
		if (start_j >= range_j) {
			start_j = 0;
			start_i += tile_i;
		}
	}

	/* There still may be other threads with work: steal tile indices from
	 * the tail of their ranges and decompose each one on the fly. */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(linear_index, tile_range_j);
			const size_t start_i = tile_index_i_j.quotient * tile_i;
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			task(argument, uarch_index, start_i, start_j, min(range_i - start_i, tile_i), min(range_j - start_j, tile_j));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
358 
/*
 * Fast path for pthreadpool_parallelize_3d: linear indices are decomposed
 * into (i, j, k) via two fxdiv divisions (first by range_k, then by range_j),
 * avoiding hardware divisions in the hot loop.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_t task = (pthreadpool_task_3d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* Post-decrement range_length below (size_t) -threads_count means a real
	 * item was claimed; over-decrements by up to threads_count threads wrap
	 * into [-threads_count, -1] (mod 2^N) and stay at or above this threshold. */
	const size_t range_threshold = -threads_count;

	/* Decompose the linear start index once: innermost dimension k first,
	 * then split the quotient into (i, j). */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_3d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(range_start, range_k);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;

	/* Process thread's own range, advancing (i, j, k) with ripple carry */
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j, k);
		if (++k == range_k.value) {
			k = 0;
			if (++j == range_j.value) {
				j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work: steal linear indices from
	 * the tail of their ranges and decompose each one on the fly. */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(linear_index, range_k);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
411 
/*
 * Fast path for pthreadpool_parallelize_3d_tile_1d: iterates (i, j, start_k)
 * where the innermost k dimension is tiled by tile_k. Linear tile indices
 * are decomposed via fxdiv by tile_range_k, then by range_j.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_1d_t task = (pthreadpool_task_3d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* Post-decrement range_length below (size_t) -threads_count means a real
	 * tile was claimed; over-decrements by up to threads_count threads wrap
	 * into [-threads_count, -1] (mod 2^N) and stay at or above this threshold. */
	const size_t range_threshold = -threads_count;

	/* Decompose the linear start tile index into (i, j, k-tile index),
	 * then scale the k tile index to an element offset. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
	const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k;
	/* Process thread's own range, stepping start_k with ripple carry into j, i */
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps the last tile in the k dimension */
		task(argument, i, j, start_k, min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			if (++j == range_j.value) {
				j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work: steal tile indices from
	 * the tail of their ranges and decompose each one on the fly. */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j);
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
468 
/*
 * Fast path for pthreadpool_parallelize_3d_tile_2d: the j and k dimensions
 * are tiled (tile_j x tile_k); each work item is one 2D tile at a fixed i.
 * Linear tile indices are decomposed via fxdiv by tile_range_k, then by
 * tile_range_j.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_2d_t task = (pthreadpool_task_3d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* Post-decrement range_length below (size_t) -threads_count means a real
	 * tile was claimed; over-decrements by up to threads_count threads wrap
	 * into [-threads_count, -1] (mod 2^N) and stay at or above this threshold. */
	const size_t range_threshold = -threads_count;

	/* Decompose the linear start tile index into (i, j-tile, k-tile),
	 * then scale the tile indices to element offsets. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d.tile_j;
	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d.tile_k;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_2d.range_k;
	const size_t range_j = threadpool->params.parallelize_3d_tile_2d.range_j;
	/* Process thread's own range, stepping k-first with ripple carry into j, i */
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps edge tiles in each tiled dimension */
		task(argument, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			start_j += tile_j;
			if (start_j >= range_j) {
				start_j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work: steal tile indices from
	 * the tail of their ranges and decompose each one on the fly. */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
529 
/*
 * Fast path for pthreadpool_parallelize_3d_tile_2d_with_uarch: 2D tiling of
 * the j and k dimensions at fixed i, with the current core's
 * microarchitecture index passed to every task invocation.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_3d_tile_2d_with_id_t task = (pthreadpool_task_3d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	/* Query the current core's uarch index; fall back to the default when
	 * cpuinfo is unavailable or the index exceeds the declared maximum. */
	const uint32_t default_uarch_index = threadpool->params.parallelize_3d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_3d_tile_2d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	const size_t threads_count = threadpool->threads_count.value;
	/* Post-decrement range_length below (size_t) -threads_count means a real
	 * tile was claimed; over-decrements by up to threads_count threads wrap
	 * into [-threads_count, -1] (mod 2^N) and stay at or above this threshold. */
	const size_t range_threshold = -threads_count;

	/* Decompose the linear start tile index into (i, j-tile, k-tile),
	 * then scale the tile indices to element offsets. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_k;
	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k);
	const struct fxdiv_divisor_size_t tile_range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_range_j;
	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
	const size_t tile_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_j;
	const size_t tile_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.tile_k;
	size_t i = tile_index_i_j.quotient;
	size_t start_j = tile_index_i_j.remainder * tile_j;
	size_t start_k = tile_index_ij_k.remainder * tile_k;

	const size_t range_k = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_k;
	const size_t range_j = threadpool->params.parallelize_3d_tile_2d_with_uarch.range_j;
	/* Process thread's own range, stepping k-first with ripple carry into j, i */
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps edge tiles in each tiled dimension */
		task(argument, uarch_index, i, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		start_k += tile_k;
		if (start_k >= range_k) {
			start_k = 0;
			start_j += tile_j;
			if (start_j >= range_j) {
				start_j = 0;
				i += 1;
			}
		}
	}

	/* There still may be other threads with work: steal tile indices from
	 * the tail of their ranges and decompose each one on the fly. */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
			const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
			const size_t start_j = tile_index_i_j.remainder * tile_j;
			const size_t start_k = tile_index_ij_k.remainder * tile_k;
			task(argument, uarch_index, tile_index_i_j.quotient, start_j, start_k, min(range_j - start_j, tile_j), min(range_k - start_k, tile_k));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
599 
/*
 * Fast path for pthreadpool_parallelize_4d: linear indices are decomposed
 * into (i, j, k, l) via three fxdiv divisions — first by the combined
 * range_kl, then the quotient by range_j and the remainder by range_l.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_4d_t task = (pthreadpool_task_4d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/* Post-decrement range_length below (size_t) -threads_count means a real
	 * item was claimed; over-decrements by up to threads_count threads wrap
	 * into [-threads_count, -1] (mod 2^N) and stay at or above this threshold. */
	const size_t range_threshold = -threads_count;

	/* Decompose the linear start index: split off the combined (k, l) part
	 * first, then derive (i, j) and (k, l) with two more divisions. */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_4d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(range_start, range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_4d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;

	/* range_k is only needed as a plain bound for the carry chain below */
	const size_t range_k = threadpool->params.parallelize_4d.range_k;
	/* Process thread's own range, advancing (i, j, k, l) with ripple carry */
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j, k, l);
		if (++l == range_l.value) {
			l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work: steal linear indices from
	 * the tail of their ranges and decompose each one on the fly. */
	const size_t thread_number = thread->thread_number;
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(linear_index, range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
660 
/*
 * Worker-thread entry point for the 4D tile-1D parallelization fast path.
 *
 * The thread first consumes its own pre-assigned slice of the linearized
 * (i, j, k, l-tile) iteration space, then steals leftover items from the tail
 * ends of other threads' slices.  All atomic accesses are relaxed; the final
 * release fence publishes this thread's work to other threads.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Task function and user argument published by the parallelization caller. */
	const pthreadpool_task_4d_tile_1d_t task = (pthreadpool_task_4d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length counters may be concurrently decremented past zero by up to
	 * threads_count threads.  After unsigned underflow the result is always
	 * >= -threads_count, so a decrement-fetch result below this threshold
	 * still identifies a valid, unprocessed item.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k, l-tile) via fxdiv. */
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_1d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_1d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_l = threadpool->params.parallelize_4d_tile_1d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = tile_index_k_l.quotient;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_1d.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_1d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min() clamps the final (possibly partial) tile along the l dimension. */
		task(argument, i, j, k, start_l, min(range_l - start_l, tile_l));
		/* Odometer-style advance: step l by a tile, carry into k, then j, then i. */
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			if (++k == range_k) {
				k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/*
	 * Visit victims in reverse circular order starting at thread_number - 1.
	 * Items are stolen from the end of a victim's range (range_end counts
	 * down), the opposite end from where the victim itself consumes.
	 */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			/* Re-derive the 4D tile coordinates for the stolen linear index. */
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, tile_index_k_l.quotient, start_l, min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
725 
/*
 * Worker-thread entry point for the 4D tile-2D parallelization fast path.
 *
 * The iteration space is linearized over (i, j, k-tile, l-tile); the thread
 * consumes its own pre-assigned slice, then steals leftovers from the tail
 * ends of other threads' slices.  Atomics are relaxed; the trailing release
 * fence publishes this thread's work.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Task function and user argument published by the parallelization caller. */
	const pthreadpool_task_4d_tile_2d_t task = (pthreadpool_task_4d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length may underflow past zero under concurrent decrements by up
	 * to threads_count threads; any decrement-fetch result below
	 * -threads_count still denotes a valid item.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k-tile, l-tile) via fxdiv. */
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min() clamps the final (possibly partial) tiles along k and l. */
		task(argument, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		/* Odometer-style advance: tile-step l, carry into k tiles, then j, then i. */
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from victims in reverse circular order, taking items from range_end. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			/* Re-derive the 4D tile coordinates for the stolen linear index. */
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
793 
/*
 * Worker-thread entry point for the 4D tile-2D parallelization fast path with
 * a per-thread microarchitecture index.
 *
 * Identical structure to the plain 4D tile-2D fast path, except that a uarch
 * index — queried from cpuinfo when available, otherwise the caller-supplied
 * default — is passed through to every task invocation.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Task function and user argument published by the parallelization caller. */
	const pthreadpool_task_4d_tile_2d_with_id_t task = (pthreadpool_task_4d_tile_2d_with_id_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const uint32_t default_uarch_index = threadpool->params.parallelize_4d_tile_2d_with_uarch.default_uarch_index;
	uint32_t uarch_index = default_uarch_index;
	#if PTHREADPOOL_USE_CPUINFO
		/* Query the uarch of the core this thread currently runs on; fall back
		 * to the default if the reported index exceeds the caller's bound. */
		uarch_index = cpuinfo_get_current_uarch_index_with_default(default_uarch_index);
		if (uarch_index > threadpool->params.parallelize_4d_tile_2d_with_uarch.max_uarch_index) {
			uarch_index = default_uarch_index;
		}
	#endif

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length may underflow past zero under concurrent decrements by up
	 * to threads_count threads; any decrement-fetch result below
	 * -threads_count still denotes a valid item.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k-tile, l-tile) via fxdiv. */
	const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_kl;
	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_range_l;
	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
	const size_t tile_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_k;
	const size_t tile_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.tile_l;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t start_k = tile_index_k_l.quotient * tile_k;
	size_t start_l = tile_index_k_l.remainder * tile_l;

	const size_t range_l = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_l;
	const size_t range_k = threadpool->params.parallelize_4d_tile_2d_with_uarch.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min() clamps the final (possibly partial) tiles along k and l. */
		task(argument, uarch_index, i, j, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		/* Odometer-style advance: tile-step l, carry into k tiles, then j, then i. */
		start_l += tile_l;
		if (start_l >= range_l) {
			start_l = 0;
			start_k += tile_k;
			if (start_k >= range_k) {
				start_k = 0;
				if (++j == range_j.value) {
					j = 0;
					i += 1;
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from victims in reverse circular order, taking items from range_end. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			/* Re-derive the 4D tile coordinates for the stolen linear index. */
			const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
			const size_t start_k = tile_index_k_l.quotient * tile_k;
			const size_t start_l = tile_index_k_l.remainder * tile_l;
			task(argument, uarch_index, index_i_j.quotient, index_i_j.remainder, start_k, start_l, min(range_k - start_k, tile_k), min(range_l - start_l, tile_l));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
870 
/*
 * Worker-thread entry point for the 5D parallelization fast path.
 *
 * The (i, j, k, l, m) iteration space is linearized; the thread consumes its
 * own pre-assigned slice, then steals leftover items from the tail ends of
 * other threads' slices.  Atomics are relaxed; the trailing release fence
 * publishes this thread's work.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Task function and user argument published by the parallelization caller. */
	const pthreadpool_task_5d_t task = (pthreadpool_task_5d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length may underflow past zero under concurrent decrements by up
	 * to threads_count threads; any decrement-fetch result below
	 * -threads_count still denotes a valid item.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k, l, m): first split off
	 * the combined (l, m) part, then split the remaining (i, j, k) part. */
	const struct fxdiv_divisor_size_t range_lm = threadpool->params.parallelize_5d.range_lm;
	const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(range_start, range_lm);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_5d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;

	const size_t range_l = threadpool->params.parallelize_5d.range_l;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j, k, l, m);
		/* Odometer-style advance: increment m, carry into l, k, j, then i. */
		if (++m == range_m.value) {
			m = 0;
			if (++l == range_l) {
				l = 0;
				if (++k == range_k.value) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from victims in reverse circular order, taking items from range_end. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			/* Re-derive the 5D coordinates for the stolen linear index. */
			const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(linear_index, range_lm);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
938 
/*
 * Worker-thread entry point for the 5D tile-1D parallelization fast path.
 *
 * The (i, j, k, l, m-tile) iteration space is linearized; the thread consumes
 * its own pre-assigned slice, then steals leftover items from the tail ends
 * of other threads' slices.  Atomics are relaxed; the trailing release fence
 * publishes this thread's work.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Task function and user argument published by the parallelization caller. */
	const pthreadpool_task_5d_tile_1d_t task = (pthreadpool_task_5d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length may underflow past zero under concurrent decrements by up
	 * to threads_count threads; any decrement-fetch result below
	 * -threads_count still denotes a valid item.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k, l, m-tile) via fxdiv. */
	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_1d.tile_range_m;
	const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(range_start, tile_range_m);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_5d_tile_1d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_5d_tile_1d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	const size_t tile_m = threadpool->params.parallelize_5d_tile_1d.tile_m;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;
	size_t start_m = tile_index_ijkl_m.remainder * tile_m;

	const size_t range_m = threadpool->params.parallelize_5d_tile_1d.range_m;
	const size_t range_k = threadpool->params.parallelize_5d_tile_1d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min() clamps the final (possibly partial) tile along the m dimension. */
		task(argument, i, j, k, l, start_m, min(range_m - start_m, tile_m));
		/* Odometer-style advance: tile-step m, carry into l, k, j, then i. */
		start_m += tile_m;
		if (start_m >= range_m) {
			start_m = 0;
			if (++l == range_l.value) {
				l = 0;
				if (++k == range_k) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from victims in reverse circular order, taking items from range_end. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			/* Re-derive the 5D tile coordinates for the stolen linear index. */
			const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(linear_index, tile_range_m);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			size_t start_m = tile_index_ijkl_m.remainder * tile_m;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, start_m,
				min(range_m - start_m, tile_m));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1011 
/*
 * Worker-thread entry point for the 5D tile-2D parallelization fast path.
 *
 * The (i, j, k, l-tile, m-tile) iteration space is linearized; the thread
 * consumes its own pre-assigned slice, then steals leftover items from the
 * tail ends of other threads' slices.  Atomics are relaxed; the trailing
 * release fence publishes this thread's work.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Task function and user argument published by the parallelization caller. */
	const pthreadpool_task_5d_tile_2d_t task = (pthreadpool_task_5d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length may underflow past zero under concurrent decrements by up
	 * to threads_count threads; any decrement-fetch result below
	 * -threads_count still denotes a valid item.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k, l-tile, m-tile) via fxdiv. */
	const struct fxdiv_divisor_size_t tile_range_lm = threadpool->params.parallelize_5d_tile_2d.tile_range_lm;
	const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(range_start, tile_range_lm);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d_tile_2d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
	const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_2d.tile_range_m;
	const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const size_t tile_l = threadpool->params.parallelize_5d_tile_2d.tile_l;
	const size_t tile_m = threadpool->params.parallelize_5d_tile_2d.tile_m;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t start_l = tile_index_l_m.quotient * tile_l;
	size_t start_m = tile_index_l_m.remainder * tile_m;

	const size_t range_m = threadpool->params.parallelize_5d_tile_2d.range_m;
	const size_t range_l = threadpool->params.parallelize_5d_tile_2d.range_l;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min() clamps the final (possibly partial) tiles along l and m. */
		task(argument, i, j, k, start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
		/* Odometer-style advance: tile-step m, carry into l tiles, k, j, then i. */
		start_m += tile_m;
		if (start_m >= range_m) {
			start_m = 0;
			start_l += tile_l;
			if (start_l >= range_l) {
				start_l = 0;
				if (++k == range_k.value) {
					k = 0;
					if (++j == range_j.value) {
						j = 0;
						i += 1;
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from victims in reverse circular order, taking items from range_end. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			/* Re-derive the 5D tile coordinates for the stolen linear index. */
			const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(linear_index, tile_range_lm);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k);
			const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const size_t start_l = tile_index_l_m.quotient * tile_l;
			const size_t start_m = tile_index_l_m.remainder * tile_m;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder,
				start_l, start_m, min(range_l - start_l, tile_l), min(range_m - start_m, tile_m));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1087 
/*
 * Worker-thread entry point for the 6D parallelization fast path.
 *
 * The (i, j, k, l, m, n) iteration space is linearized; the thread consumes
 * its own pre-assigned slice, then steals leftover items from the tail ends
 * of other threads' slices.  Atomics are relaxed; the trailing release fence
 * publishes this thread's work.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Task function and user argument published by the parallelization caller. */
	const pthreadpool_task_6d_t task = (pthreadpool_task_6d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length may underflow past zero under concurrent decrements by up
	 * to threads_count threads; any decrement-fetch result below
	 * -threads_count still denotes a valid item.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k, l, m, n): first split off
	 * the combined (l, m, n) part, then split each group with fxdiv. */
	const struct fxdiv_divisor_size_t range_lmn = threadpool->params.parallelize_6d.range_lmn;
	const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(range_start, range_lmn);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
	const struct fxdiv_divisor_size_t range_n = threadpool->params.parallelize_6d.range_n;
	const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;
	size_t n = index_lm_n.remainder;

	const size_t range_l = threadpool->params.parallelize_6d.range_l;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		task(argument, i, j, k, l, m, n);
		/* Odometer-style advance: increment n, carry into m, l, k, j, then i. */
		if (++n == range_n.value) {
			n = 0;
			if (++m == range_m.value) {
				m = 0;
				if (++l == range_l) {
					l = 0;
					if (++k == range_k.value) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}


	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from victims in reverse circular order, taking items from range_end. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			/* Re-derive the 6D coordinates for the stolen linear index. */
			const struct fxdiv_result_size_t index_ijk_lmn = fxdiv_divide_size_t(linear_index, range_lmn);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lmn.quotient, range_k);
			const struct fxdiv_result_size_t index_lm_n = fxdiv_divide_size_t(index_ijk_lmn.remainder, range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_lm_n.quotient, range_m);
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder, index_lm_n.remainder);
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1163 
/*
 * Worker-thread entry point for the 6D tile-1D parallelization fast path.
 *
 * The (i, j, k, l, m, n-tile) iteration space is linearized; the thread
 * consumes its own pre-assigned slice, then steals leftover items from the
 * tail ends of other threads' slices.  Atomics are relaxed; the trailing
 * release fence publishes this thread's work.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_tile_1d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	/* Task function and user argument published by the parallelization caller. */
	const pthreadpool_task_6d_tile_1d_t task = (pthreadpool_task_6d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * range_length may underflow past zero under concurrent decrements by up
	 * to threads_count threads; any decrement-fetch result below
	 * -threads_count still denotes a valid item.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/* Decompose the linear start index into (i, j, k, l, m, n-tile) via fxdiv. */
	const struct fxdiv_divisor_size_t tile_range_lmn = threadpool->params.parallelize_6d_tile_1d.tile_range_lmn;
	const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(range_start, tile_range_lmn);
	const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_6d_tile_1d.range_k;
	const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_1d.tile_range_n;
	const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_1d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
	const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_6d_tile_1d.range_m;
	const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
	const size_t tile_n = threadpool->params.parallelize_6d_tile_1d.tile_n;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_ij_k.remainder;
	size_t l = index_l_m.quotient;
	size_t m = index_l_m.remainder;
	size_t start_n = tile_index_lm_n.remainder * tile_n;

	const size_t range_n = threadpool->params.parallelize_6d_tile_1d.range_n;
	const size_t range_l = threadpool->params.parallelize_6d_tile_1d.range_l;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min() clamps the final (possibly partial) tile along the n dimension. */
		task(argument, i, j, k, l, m, start_n, min(range_n - start_n, tile_n));
		/* Odometer-style advance: tile-step n, carry into m, l, k, j, then i. */
		start_n += tile_n;
		if (start_n >= range_n) {
			start_n = 0;
			if (++m == range_m.value) {
				m = 0;
				if (++l == range_l) {
					l = 0;
					if (++k == range_k.value) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}


	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/* Steal from victims in reverse circular order, taking items from range_end. */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			/* Re-derive the 6D tile coordinates for the stolen linear index. */
			const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(linear_index, tile_range_lmn);
			const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k);
			const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j);
			const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, range_m);
			const size_t start_n = tile_index_lm_n.remainder * tile_n;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder,
				start_n, min(range_n - start_n, tile_n));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1244 
/*
 * Fast-path worker for pthreadpool_parallelize_6d_tile_2d: iterates a 6D
 * index space (i, j, k, l, m, n) where the two innermost dimensions (m, n)
 * are processed in tiles of tile_m x tile_n. Each invocation of task receives
 * (i, j, k, l, start_m, start_n, tile_height, tile_width), with the tile
 * extents clamped at the range boundary.
 *
 * The thread first drains its own pre-assigned [range_start, range_end) slice
 * of linearized tile indices, then steals remaining work from other threads'
 * slices. All counter updates use relaxed atomics; a release fence at the end
 * publishes this thread's effects.
 */
PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_tile_2d_fastpath(
	struct pthreadpool* threadpool,
	struct thread_info* thread)
{
	assert(threadpool != NULL);
	assert(thread != NULL);

	const pthreadpool_task_6d_tile_2d_t task = (pthreadpool_task_6d_tile_2d_t) pthreadpool_load_relaxed_void_p(&threadpool->task);
	void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument);

	const size_t threads_count = threadpool->threads_count.value;
	/*
	 * Underflow sentinel for the decrement-fetch loops below. range_length
	 * is decremented before each check, so once it wraps past zero the
	 * (unsigned) result lands in [-threads_count, -1]: at most threads_count
	 * threads can each decrement an exhausted counter once before observing
	 * exhaustion. Thus "new value < range_threshold" means a work item was
	 * successfully claimed.
	 */
	const size_t range_threshold = -threads_count;

	/* Process thread's own range of items */
	const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start);
	/*
	 * Decompose the linear tile index into (i, j, k, l, tile_m_index,
	 * tile_n_index) using precomputed fxdiv divisors. The linearization is
	 * row-major with n innermost:
	 *   linear = (((i * range_j + j) * range_kl_flat) + k * range_l + l)
	 *            * tile_range_mn + tile_m_index * tile_range_n + tile_n_index
	 */
	const struct fxdiv_divisor_size_t tile_range_mn = threadpool->params.parallelize_6d_tile_2d.tile_range_mn;
	const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(range_start, tile_range_mn);
	const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_6d_tile_2d.range_kl;
	const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
	const struct fxdiv_divisor_size_t tile_range_n = threadpool->params.parallelize_6d_tile_2d.tile_range_n;
	const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
	const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_6d_tile_2d.range_j;
	const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
	const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_6d_tile_2d.range_l;
	const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
	const size_t tile_m = threadpool->params.parallelize_6d_tile_2d.tile_m;
	const size_t tile_n = threadpool->params.parallelize_6d_tile_2d.tile_n;
	size_t i = index_i_j.quotient;
	size_t j = index_i_j.remainder;
	size_t k = index_k_l.quotient;
	size_t l = index_k_l.remainder;
	/* Tile indices are converted to element offsets within the m/n ranges */
	size_t start_m = tile_index_m_n.quotient * tile_m;
	size_t start_n = tile_index_m_n.remainder * tile_n;

	const size_t range_n = threadpool->params.parallelize_6d_tile_2d.range_n;
	const size_t range_m = threadpool->params.parallelize_6d_tile_2d.range_m;
	const size_t range_k = threadpool->params.parallelize_6d_tile_2d.range_k;
	while (pthreadpool_decrement_fetch_relaxed_size_t(&thread->range_length) < range_threshold) {
		/* min(...) clamps the last, possibly partial, tile in each dimension */
		task(argument, i, j, k, l, start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
		/*
		 * Advance to the next tile in row-major order (n fastest), carrying
		 * into m, l, k, j, i as each dimension overflows. This avoids
		 * re-dividing the linear index on every iteration.
		 */
		start_n += tile_n;
		if (start_n >= range_n) {
			start_n = 0;
			start_m += tile_m;
			if (start_m >= range_m) {
				start_m = 0;
				if (++l == range_l.value) {
					l = 0;
					if (++k == range_k) {
						k = 0;
						if (++j == range_j.value) {
							j = 0;
							i += 1;
						}
					}
				}
			}
		}
	}

	/* There still may be other threads with work */
	const size_t thread_number = thread->thread_number;
	/*
	 * Steal work from other threads, scanning in decreasing (circular) order
	 * of thread index starting at the neighbour below this thread. Items are
	 * claimed from the tail of the victim's range (range_end is decremented),
	 * opposite to the owner, which consumes from the head.
	 */
	for (size_t tid = modulo_decrement(thread_number, threads_count);
		tid != thread_number;
		tid = modulo_decrement(tid, threads_count))
	{
		struct thread_info* other_thread = &threadpool->threads[tid];
		while (pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_length) < range_threshold) {
			const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end);
			/* Stolen indices are not sequential, so fully re-decompose each one */
			const struct fxdiv_result_size_t tile_index_ijkl_mn = fxdiv_divide_size_t(linear_index, tile_range_mn);
			const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_mn.quotient, range_kl);
			const struct fxdiv_result_size_t tile_index_m_n = fxdiv_divide_size_t(tile_index_ijkl_mn.remainder, tile_range_n);
			const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j);
			const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l);
			const size_t start_m = tile_index_m_n.quotient * tile_m;
			const size_t start_n = tile_index_m_n.remainder * tile_n;
			task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder,
				start_m, start_n, min(range_m - start_m, tile_m), min(range_n - start_n, tile_n));
		}
	}

	/* Make changes by this thread visible to other threads */
	pthreadpool_fence_release();
}
1328