1 /* Standard C headers */
2 #include <stddef.h>
3 
4 /* Dependencies */
5 #include <fxdiv.h>
6 
7 /* Public library header */
8 #include <pthreadpool.h>
9 
10 /* Internal library headers */
11 #include "threadpool-utils.h"
12 
13 
/*
 * Forwards a 1D computation to pthreadpool_parallelize_1d.
 *
 * function is cast to pthreadpool_task_1d_t; threadpool, argument and range
 * are passed through unchanged, and the flags argument is always 0.
 */
void pthreadpool_compute_1d(
	pthreadpool_t threadpool,
	pthreadpool_function_1d_t function,
	void* argument,
	size_t range)
{
	const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) function;
	pthreadpool_parallelize_1d(threadpool, task, argument, range, 0 /* flags */);
}
24 
/*
 * Forwards a tiled 1D computation to pthreadpool_parallelize_1d_tile_1d.
 *
 * function is cast to pthreadpool_task_1d_tile_1d_t; threadpool, argument,
 * range and tile are passed through unchanged, and the flags argument is
 * always 0.
 */
void pthreadpool_compute_1d_tiled(
	pthreadpool_t threadpool,
	pthreadpool_function_1d_tiled_t function,
	void* argument,
	size_t range,
	size_t tile)
{
	const pthreadpool_task_1d_tile_1d_t task = (pthreadpool_task_1d_tile_1d_t) function;
	pthreadpool_parallelize_1d_tile_1d(threadpool, task, argument, range, tile, 0 /* flags */);
}
36 
/*
 * Forwards a 2D computation to pthreadpool_parallelize_2d.
 *
 * function is cast to pthreadpool_task_2d_t; threadpool, argument, range_i
 * and range_j are passed through unchanged, and the flags argument is
 * always 0.
 */
void pthreadpool_compute_2d(
	pthreadpool_t threadpool,
	pthreadpool_function_2d_t function,
	void* argument,
	size_t range_i,
	size_t range_j)
{
	const pthreadpool_task_2d_t task = (pthreadpool_task_2d_t) function;
	pthreadpool_parallelize_2d(threadpool, task, argument, range_i, range_j, 0 /* flags */);
}
48 
/*
 * Forwards a tiled 2D computation to pthreadpool_parallelize_2d_tile_2d.
 *
 * function is cast to pthreadpool_task_2d_tile_2d_t; threadpool, argument,
 * both ranges and both tile sizes are passed through unchanged, and the
 * flags argument is always 0.
 */
void pthreadpool_compute_2d_tiled(
	pthreadpool_t threadpool,
	pthreadpool_function_2d_tiled_t function,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t tile_i,
	size_t tile_j)
{
	const pthreadpool_task_2d_tile_2d_t task = (pthreadpool_task_2d_tile_2d_t) function;
	pthreadpool_parallelize_2d_tile_2d(
		threadpool, task, argument,
		range_i, range_j,
		tile_i, tile_j,
		0 /* flags */);
}
62 
63 struct compute_3d_tiled_context {
64 	pthreadpool_function_3d_tiled_t function;
65 	void* argument;
66 	struct fxdiv_divisor_size_t tile_range_j;
67 	struct fxdiv_divisor_size_t tile_range_k;
68 	size_t range_i;
69 	size_t range_j;
70 	size_t range_k;
71 	size_t tile_i;
72 	size_t tile_j;
73 	size_t tile_k;
74 };
75 
compute_3d_tiled(const struct compute_3d_tiled_context * context,size_t linear_index)76 static void compute_3d_tiled(const struct compute_3d_tiled_context* context, size_t linear_index) {
77 	const struct fxdiv_divisor_size_t tile_range_k = context->tile_range_k;
78 	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k);
79 	const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j;
80 	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j);
81 	const size_t max_tile_i = context->tile_i;
82 	const size_t max_tile_j = context->tile_j;
83 	const size_t max_tile_k = context->tile_k;
84 	const size_t index_i = tile_index_i_j.quotient * max_tile_i;
85 	const size_t index_j = tile_index_i_j.remainder * max_tile_j;
86 	const size_t index_k = tile_index_ij_k.remainder * max_tile_k;
87 	const size_t tile_i = min(max_tile_i, context->range_i - index_i);
88 	const size_t tile_j = min(max_tile_j, context->range_j - index_j);
89 	const size_t tile_k = min(max_tile_k, context->range_k - index_k);
90 	context->function(context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k);
91 }
92 
/*
 * Invokes function once for every tile of the 3D iteration space
 * [0, range_i) x [0, range_j) x [0, range_k).
 *
 * Each call receives argument, the starting indices of the tile, and the
 * tile extents; extents are clipped so that the last tile along a dimension
 * does not run past the range.
 *
 * If pthreadpool_get_threads_count(threadpool) is at most 1, the tiles are
 * processed sequentially on the calling thread. Otherwise the tile grid is
 * linearized into a single 1D range and dispatched through
 * pthreadpool_parallelize_1d with compute_3d_tiled as the task.
 *
 * Tile sizes must be non-zero: the sequential loops advance by the tile
 * size and the parallel path divides the ranges by it.
 */
void pthreadpool_compute_3d_tiled(
	pthreadpool_t threadpool,
	pthreadpool_function_3d_tiled_t function,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t tile_i,
	size_t tile_j,
	size_t tile_k)
{
	if (range_i == 0 || range_j == 0 || range_k == 0) {
		/*
		 * Empty iteration space: there is no tile to process. Returning here
		 * also keeps the parallel path from building an fxdiv divisor out of
		 * a zero tile count.
		 */
		return;
	}

	if (pthreadpool_get_threads_count(threadpool) <= 1) {
		/* No thread pool used: execute function sequentially on the calling thread */
		for (size_t i = 0; i < range_i; i += tile_i) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					function(argument, i, j, k,
						min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k));
				}
			}
		}
	} else {
		/* Execute in parallel on the thread pool using linearized index */
		const size_t tile_range_i = divide_round_up(range_i, tile_i);
		const size_t tile_range_j = divide_round_up(range_j, tile_j);
		const size_t tile_range_k = divide_round_up(range_k, tile_k);
		struct compute_3d_tiled_context context = {
			.function = function,
			.argument = argument,
			.tile_range_j = fxdiv_init_size_t(tile_range_j),
			.tile_range_k = fxdiv_init_size_t(tile_range_k),
			.range_i = range_i,
			.range_j = range_j,
			.range_k = range_k,
			.tile_i = tile_i,
			.tile_j = tile_j,
			.tile_k = tile_k
		};
		pthreadpool_parallelize_1d(threadpool,
			(pthreadpool_task_1d_t) compute_3d_tiled, &context,
			tile_range_i * tile_range_j * tile_range_k,
			0 /* flags */);
	}
}
136 
137 struct compute_4d_tiled_context {
138 	pthreadpool_function_4d_tiled_t function;
139 	void* argument;
140 	struct fxdiv_divisor_size_t tile_range_kl;
141 	struct fxdiv_divisor_size_t tile_range_j;
142 	struct fxdiv_divisor_size_t tile_range_l;
143 	size_t range_i;
144 	size_t range_j;
145 	size_t range_k;
146 	size_t range_l;
147 	size_t tile_i;
148 	size_t tile_j;
149 	size_t tile_k;
150 	size_t tile_l;
151 };
152 
compute_4d_tiled(const struct compute_4d_tiled_context * context,size_t linear_index)153 static void compute_4d_tiled(const struct compute_4d_tiled_context* context, size_t linear_index) {
154 	const struct fxdiv_divisor_size_t tile_range_kl = context->tile_range_kl;
155 	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl);
156 	const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j;
157 	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, tile_range_j);
158 	const struct fxdiv_divisor_size_t tile_range_l = context->tile_range_l;
159 	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l);
160 	const size_t max_tile_i = context->tile_i;
161 	const size_t max_tile_j = context->tile_j;
162 	const size_t max_tile_k = context->tile_k;
163 	const size_t max_tile_l = context->tile_l;
164 	const size_t index_i = tile_index_i_j.quotient * max_tile_i;
165 	const size_t index_j = tile_index_i_j.remainder * max_tile_j;
166 	const size_t index_k = tile_index_k_l.quotient * max_tile_k;
167 	const size_t index_l = tile_index_k_l.remainder * max_tile_l;
168 	const size_t tile_i = min(max_tile_i, context->range_i - index_i);
169 	const size_t tile_j = min(max_tile_j, context->range_j - index_j);
170 	const size_t tile_k = min(max_tile_k, context->range_k - index_k);
171 	const size_t tile_l = min(max_tile_l, context->range_l - index_l);
172 	context->function(context->argument, index_i, index_j, index_k, index_l, tile_i, tile_j, tile_k, tile_l);
173 }
174 
/*
 * Invokes function once for every tile of the 4D iteration space
 * [0, range_i) x [0, range_j) x [0, range_k) x [0, range_l).
 *
 * Each call receives argument, the starting indices of the tile, and the
 * tile extents; extents are clipped so that the last tile along a dimension
 * does not run past the range.
 *
 * If pthreadpool_get_threads_count(threadpool) is at most 1, the tiles are
 * processed sequentially on the calling thread. Otherwise the tile grid is
 * linearized into a single 1D range and dispatched through
 * pthreadpool_parallelize_1d with compute_4d_tiled as the task.
 *
 * Tile sizes must be non-zero: the sequential loops advance by the tile
 * size and the parallel path divides the ranges by it.
 */
void pthreadpool_compute_4d_tiled(
	pthreadpool_t threadpool,
	pthreadpool_function_4d_tiled_t function,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t tile_i,
	size_t tile_j,
	size_t tile_k,
	size_t tile_l)
{
	if (range_i == 0 || range_j == 0 || range_k == 0 || range_l == 0) {
		/*
		 * Empty iteration space: there is no tile to process. Returning here
		 * also keeps the parallel path from building an fxdiv divisor out of
		 * a zero tile count.
		 */
		return;
	}

	if (pthreadpool_get_threads_count(threadpool) <= 1) {
		/* No thread pool used: execute function sequentially on the calling thread */
		for (size_t i = 0; i < range_i; i += tile_i) {
			for (size_t j = 0; j < range_j; j += tile_j) {
				for (size_t k = 0; k < range_k; k += tile_k) {
					for (size_t l = 0; l < range_l; l += tile_l) {
						function(argument, i, j, k, l,
							min(range_i - i, tile_i), min(range_j - j, tile_j),
							min(range_k - k, tile_k), min(range_l - l, tile_l));
					}
				}
			}
		}
	} else {
		/* Execute in parallel on the thread pool using linearized index */
		const size_t tile_range_i = divide_round_up(range_i, tile_i);
		const size_t tile_range_j = divide_round_up(range_j, tile_j);
		const size_t tile_range_k = divide_round_up(range_k, tile_k);
		const size_t tile_range_l = divide_round_up(range_l, tile_l);
		struct compute_4d_tiled_context context = {
			.function = function,
			.argument = argument,
			.tile_range_kl = fxdiv_init_size_t(tile_range_k * tile_range_l),
			.tile_range_j = fxdiv_init_size_t(tile_range_j),
			.tile_range_l = fxdiv_init_size_t(tile_range_l),
			.range_i = range_i,
			.range_j = range_j,
			.range_k = range_k,
			.range_l = range_l,
			.tile_i = tile_i,
			.tile_j = tile_j,
			.tile_k = tile_k,
			.tile_l = tile_l
		};
		pthreadpool_parallelize_1d(threadpool,
			(pthreadpool_task_1d_t) compute_4d_tiled, &context,
			tile_range_i * tile_range_j * tile_range_k * tile_range_l,
			0 /* flags */);
	}
}
227