1 // z.lib example code for tile-based threading.
2 //
3 // Example code demonstrates the use of z.lib to scale a single image by
4 // dividing the output into tiles. For processing multiple images, it is
5 // recommended to use frame-based threading for higher efficiency.
6 
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <exception>
#include <iostream>
#include <memory>
#include <mutex>
#include <new>
#include <stdexcept>
#include <system_error>
#include <thread>
#include <utility>
#include <vector>
17 
18 #include <zimg++.hpp>
19 
20 #include "aligned_malloc.h"
21 #include "argparse.h"
22 #include "win32_bitmap.h"
23 
24 #if ZIMG_API_VERSION < ZIMG_MAKE_API_VERSION(2, 1)
25   #error API 2.1 required
26 #endif
27 
28 namespace {
29 
// Parsed command line. The option tables below address these fields by
// offsetof(), so every member must stay a plain data member.
struct Arguments {
	// Positional arguments.
	const char *inpath;    // input path
	const char *outpath;   // output path
	unsigned out_w;        // output width
	unsigned out_h;        // output height

	// Optional switches.
	unsigned tile_width;   // tile width
	unsigned tile_height;  // tile height
	unsigned threads;      // number of threads; 0 lets execute() pick a count
	char interactive;      // non-zero selects interactive mode

	// Not bound to any entry of the option tables in this file.
	char opt;
};
41 
42 const ArgparseOption program_switches[] = {
43 	{ OPTION_UINT, nullptr, "tile-width",  offsetof(Arguments, tile_width),  nullptr, "tile width" },
44 	{ OPTION_UINT, nullptr, "tile-height", offsetof(Arguments, tile_height), nullptr, "tile height" },
45 	{ OPTION_UINT, nullptr, "threads",     offsetof(Arguments, threads),     nullptr, "number of threads" },
46 	{ OPTION_FLAG, "i",     "interactive", offsetof(Arguments, interactive), nullptr, "interactive mode" },
47 	{ OPTION_NULL },
48 };
49 
50 const ArgparseOption program_positional[] = {
51 	{ OPTION_STRING, nullptr, "inpath",  offsetof(Arguments, inpath),  nullptr, "input path" },
52 	{ OPTION_STRING, nullptr, "outpath", offsetof(Arguments, outpath), nullptr, "output path" },
53 	{ OPTION_UINT,   "w",     "width",   offsetof(Arguments, out_w),   nullptr, "width" },
54 	{ OPTION_UINT,   "h",     "height",  offsetof(Arguments, out_h),   nullptr, "height" },
55 	{ OPTION_NULL },
56 };
57 
58 const ArgparseCommandLine program_def = { program_switches, program_positional, "tile_example", "resize BMP images with tile-based threading" };
59 
60 
61 struct TileTask {
62 	zimgxx::zimage_format src_format;
63 	zimgxx::zimage_format dst_format;
64 	unsigned tile_left;
65 	unsigned tile_top;
66 };
67 
68 struct Callback {
69 	const WindowsBitmap *in_bmp;
70 	WindowsBitmap *out_bmp;
71 	const TileTask *task;
72 	const zimgxx::zimage_buffer *src_buf;
73 	const zimgxx::zimage_buffer *dst_buf;
74 };
75 
allocate_buffer(const zimgxx::zimage_format & format,unsigned count)76 std::pair<zimgxx::zimage_buffer, std::shared_ptr<void>> allocate_buffer(const zimgxx::zimage_format &format, unsigned count)
77 {
78 	zimgxx::zimage_buffer buffer;
79 	std::shared_ptr<void> handle;
80 	unsigned char *ptr;
81 
82 	unsigned mask = zimg_select_buffer_mask(count);
83 	size_t channel_size[3] = { 0 };
84 	size_t pixel_size;
85 
86 	count = (mask == ZIMG_BUFFER_MAX) ? format.height : mask + 1;
87 
88 	if (format.pixel_type == ZIMG_PIXEL_FLOAT)
89 		pixel_size = sizeof(float);
90 	else if (format.pixel_type == ZIMG_PIXEL_WORD || format.pixel_type == ZIMG_PIXEL_HALF)
91 		pixel_size = sizeof(uint16_t);
92 	else
93 		pixel_size = sizeof(uint8_t);
94 
95 	for (unsigned p = 0; p < (format.color_family == ZIMG_COLOR_GREY ? 1U : 3U); ++p) {
96 		unsigned count_plane = p ? count >> format.subsample_h : count;
97 		unsigned mask_plane = (mask == ZIMG_BUFFER_MAX) ? mask : mask >> format.subsample_h;
98 		size_t row_size = format.width * pixel_size;
99 		ptrdiff_t stride = (row_size + 31) & ~31;
100 
101 		buffer.mask(p) = mask_plane;
102 		buffer.stride(p) = stride;
103 		channel_size[p] = static_cast<size_t>(stride) * count_plane;
104 	}
105 
106 	handle.reset(aligned_malloc(channel_size[0] + channel_size[1] + channel_size[2], 32), &aligned_free);
107 	ptr = static_cast<unsigned char *>(handle.get());
108 
109 	for (unsigned p = 0; p < (format.color_family == ZIMG_COLOR_GREY ? 1U : 3U); ++p) {
110 		buffer.data(p) = ptr;
111 		ptr += channel_size[p];
112 	}
113 
114 	return{ buffer, handle };
115 }
116 
allocate_buffer(size_t size)117 std::shared_ptr<void> allocate_buffer(size_t size)
118 {
119 	return{ aligned_malloc(size, 32), &aligned_free };
120 }
121 
unpack_bmp(void * user,unsigned i,unsigned left,unsigned right)122 int unpack_bmp(void *user, unsigned i, unsigned left, unsigned right)
123 {
124 	Callback *cb = static_cast<Callback *>(user);
125 
126 	// Pixel indices in the input image are relative to the whole image.
127 	const uint8_t *packed_bgr = cb->in_bmp->read_ptr() + static_cast<ptrdiff_t>(i) * cb->in_bmp->stride();
128 	uint8_t *planar_r = static_cast<uint8_t *>(cb->src_buf->line_at(i, 0));
129 	uint8_t *planar_g = static_cast<uint8_t *>(cb->src_buf->line_at(i, 1));
130 	uint8_t *planar_b = static_cast<uint8_t *>(cb->src_buf->line_at(i, 2));
131 	unsigned step = cb->in_bmp->bit_count() / 8;
132 
133 	for (unsigned j = left; j < right; ++j) {
134 		uint8_t r, g, b;
135 
136 		b = packed_bgr[j * step + 0];
137 		g = packed_bgr[j * step + 1];
138 		r = packed_bgr[j * step + 2];
139 
140 		planar_r[j] = r;
141 		planar_g[j] = g;
142 		planar_b[j] = b;
143 	}
144 
145 	return 0;
146 }
147 
pack_bmp(void * user,unsigned i,unsigned left,unsigned right)148 int pack_bmp(void *user, unsigned i, unsigned left, unsigned right)
149 {
150 	Callback *cb = static_cast<Callback *>(user);
151 
152 	// Pixel indices in the output image are relative to the tile.
153 	const uint8_t *planar_r = static_cast<const uint8_t *>(cb->dst_buf->line_at(i, 0));
154 	const uint8_t *planar_g = static_cast<const uint8_t *>(cb->dst_buf->line_at(i, 1));
155 	const uint8_t *planar_b = static_cast<const uint8_t *>(cb->dst_buf->line_at(i, 2));
156 	uint8_t *packed_bgr = cb->out_bmp->write_ptr() + static_cast<ptrdiff_t>(i + cb->task->tile_top) * cb->out_bmp->stride() + (cb->task->tile_left * cb->out_bmp->bit_count() / 8);
157 	unsigned step = cb->out_bmp->bit_count() / 8;
158 
159 	for (unsigned j = left; j < right; ++j) {
160 		uint8_t r, g, b;
161 
162 		r = planar_r[j];
163 		g = planar_g[j];
164 		b = planar_b[j];
165 
166 		packed_bgr[j * step + 0] = b;
167 		packed_bgr[j * step + 1] = g;
168 		packed_bgr[j * step + 2] = r;
169 	}
170 
171 	return 0;
172 }
173 
174 void thread_func(const WindowsBitmap *in_bmp, WindowsBitmap *out_bmp, std::vector<TileTask> *tasks, std::mutex *mutex, std::exception_ptr *eptr, bool interactive);
175 
execute(const Arguments & args)176 void execute(const Arguments &args)
177 {
178 	WindowsBitmap in_bmp{ args.inpath, WindowsBitmap::READ_TAG };
179 	WindowsBitmap out_bmp{ args.outpath, static_cast<int>(args.out_w), static_cast<int>(args.out_h), 24 };
180 
181 	// (1) Fill the common fields in the format descriptors for the input and output files.
182 	zimgxx::zimage_format in_format;
183 	zimgxx::zimage_format out_format;
184 
185 	in_format.width = in_bmp.width();
186 	in_format.height = in_bmp.height();
187 	in_format.pixel_type = ZIMG_PIXEL_BYTE;
188 	in_format.color_family = ZIMG_COLOR_RGB;
189 	in_format.pixel_range = ZIMG_RANGE_FULL;
190 
191 	out_format.pixel_type = ZIMG_PIXEL_BYTE;
192 	out_format.color_family = ZIMG_COLOR_RGB;
193 	out_format.pixel_range = ZIMG_RANGE_FULL;
194 
195 	std::vector<TileTask> task_queue;
196 
197 	// (2) Calculate the bounds of the input regions from the output regions. For
198 	// each tile, a graph creates an image of tile_width x tile_height from a
199 	// subset of the input image. Note that the input tile is specified through
200 	// the active_region field, unlike the output tile.
201 	double scale_w = static_cast<double>(in_bmp.width()) / args.out_w;
202 	double scale_h = static_cast<double>(in_bmp.height()) / args.out_h;
203 
204 	// The destination buffer passed to zimg_filter_graph_process will point to
205 	// the upper-left corner of the tile. As a result, when not using a pack
206 	// callback, incrementing the output image by tile_width pixels must
207 	// maintain alignment.
208 	if (args.tile_width % 32)
209 		std::cout << "warning: tile width results in unaligned image\n";
210 
211 	for (unsigned i = 0; i < args.out_h; i += args.tile_height) {
212 		for (unsigned j = 0; j < args.out_w; j += args.tile_width) {
213 			zimgxx::zimage_format tile_in_format = in_format;
214 			zimgxx::zimage_format tile_out_format = out_format;
215 
216 			unsigned tile_right = out_bmp.width() - j >= args.tile_width ? j + args.tile_width : out_bmp.width();
217 			unsigned tile_bottom = out_bmp.height() - i >= args.tile_height ? i + args.tile_height : out_bmp.height();
218 
219 			tile_in_format.active_region.left = j * scale_w;
220 			tile_in_format.active_region.top = i * scale_h;
221 			tile_in_format.active_region.width = (tile_right - j) * scale_w;
222 			tile_in_format.active_region.height = (tile_bottom - i) * scale_h;
223 
224 			tile_out_format.width = tile_right - j;
225 			tile_out_format.height = tile_bottom - i;
226 
227 			task_queue.push_back({ tile_in_format, tile_out_format, j, i });
228 		}
229 	}
230 
231 	// (3) Distribute the tiles across threads. Note that the calls to
232 	// zimg_filter_graph_create must also be parallelized for maximum effect.
233 	std::vector<std::thread> threads;
234 	unsigned num_threads = args.interactive ? 1 : (args.threads ? args.threads : std::thread::hardware_concurrency());
235 	std::exception_ptr eptr;
236 	std::mutex mutex;
237 
238 	// Process tiles in raster order.
239 	std::reverse(task_queue.begin(), task_queue.end());
240 
241 	threads.reserve(num_threads);
242 	for (unsigned i = 0; i < num_threads; ++i) {
243 		threads.emplace_back(thread_func, &in_bmp, &out_bmp, &task_queue, &mutex, &eptr, !!args.interactive);
244 	}
245 
246 	for (std::thread &th : threads) {
247 		th.join();
248 	}
249 
250 	if (eptr)
251 		std::rethrow_exception(eptr);
252 }
253 
thread_func(const WindowsBitmap * in_bmp,WindowsBitmap * out_bmp,std::vector<TileTask> * tasks,std::mutex * mutex,std::exception_ptr * eptr,bool interactive)254 void thread_func(const WindowsBitmap *in_bmp, WindowsBitmap *out_bmp, std::vector<TileTask> *tasks, std::mutex *mutex, std::exception_ptr *eptr, bool interactive)
255 {
256 	try {
257 		while (true) {
258 			std::unique_lock<std::mutex> lock{ *mutex };
259 			if (tasks->empty())
260 				break;
261 
262 			TileTask task = tasks->back();
263 			tasks->pop_back();
264 			lock.unlock();
265 
266 			// (4) Build the processing context for the tile.
267 			zimgxx::FilterGraph graph{ zimgxx::FilterGraph::build(task.src_format, task.dst_format) };
268 			unsigned input_buffering = graph.get_input_buffering();
269 			unsigned output_buffering = graph.get_output_buffering();
270 			size_t tmp_size = graph.get_tmp_size();
271 
272 			if (input_buffering == ZIMG_BUFFER_MAX || output_buffering == ZIMG_BUFFER_MAX)
273 				throw std::logic_error{ "graph can not be processed with tiles" };
274 
275 			// (5) Allocate scanline buffers for the input and output data.
276 			auto src_buf = allocate_buffer(task.src_format, input_buffering);
277 			auto dst_buf = allocate_buffer(task.dst_format, output_buffering);
278 			auto tmp = allocate_buffer(tmp_size);
279 
280 			// (6) Process the tile.
281 			Callback cb{ in_bmp, out_bmp, &task, &src_buf.first, &dst_buf.first };
282 			graph.process(src_buf.first.as_const(), dst_buf.first, tmp.get(), unpack_bmp, &cb, pack_bmp, &cb);
283 
284 			if (interactive) {
285 				out_bmp->flush();
286 				std::cout << "Press enter to continue...";
287 				std::cin.get();
288 			}
289 		}
290 	} catch (...) {
291 		std::lock_guard<std::mutex> lock{ *mutex };
292 		*eptr = std::current_exception();
293 	}
294 }
295 
296 } // namespace
297 
298 
main(int argc,char ** argv)299 int main(int argc, char **argv)
300 {
301 	Arguments args{};
302 	int ret;
303 
304 	args.tile_width = 512;
305 	args.tile_height = 512;
306 
307 	if ((ret = argparse_parse(&program_def, &args, argc, argv)) < 0)
308 		return ret == ARGPARSE_HELP_MESSAGE ? 0 : ret;
309 
310 	if (zimg_get_api_version(nullptr, nullptr) < ZIMG_MAKE_API_VERSION(2, 1)) {
311 		std::cerr << "error: subpixel operation requires API 2.1\n";
312 		return 2;
313 	}
314 
315 	// Prior to z.lib 2.9, using horizontal tiling results in redundant loading
316 	// of scanlines above the first row in the tile.
317 	if (args.tile_height < args.out_h) {
318 		unsigned version[3];
319 		zimg_get_version_info(version, version + 1, version + 2);
320 
321 		if (version[0] < 2 || (version[0] == 2 && version[1] < 9))
322 			std::cerr << "warning: horizontal tiling may be slow in z.lib versions prior to 2.9\n";
323 	}
324 
325 	try {
326 		execute(args);
327 	} catch (const std::system_error &e) {
328 		std::cerr << "system_error " << e.code() << ": " << e.what() << '\n';
329 		return 2;
330 	} catch (const zimgxx::zerror &e) {
331 		std::cerr << "zimg error " << e.code << ": " << e.msg << '\n';
332 		return 2;
333 	} catch (const std::runtime_error &e) {
334 		std::cerr << "runtime_error: " << e.what() << '\n';
335 		return 2;
336 	} catch (const std::logic_error &e) {
337 		std::cerr << "logic_error: " << e.what() << '\n';
338 		return 2;
339 	}
340 
341 	return 0;
342 }
343