1 /*
2 * The MIT License (MIT)
3 * This file is part of waifu2x-converter-cpp
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a copy
6 * of this software and associated documentation files (the "Software"), to deal
7 * in the Software without restriction, including without limitation the rights
8 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 * copies of the Software, and to permit persons to whom the Software is
10 * furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in all
13 * copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23 
24 #include "modelHandler.hpp"
25 // #include <iostream> in modelHandler.hpp
26 #include "cvwrap.hpp"
27 #include <fstream>
28 #include <thread>
29 #include <atomic>
30 #include "sec.hpp"
31 //#include "threadPool.hpp"
32 #include "common.hpp"
33 #include "filters.hpp"
34 #include "params.h"
35 
36 namespace w2xc
37 {
38 
getNInputPlanes()39 	int Model::getNInputPlanes()
40 	{
41 		return nInputPlanes;
42 	}
43 
getNOutputPlanes()44 	int Model::getNOutputPlanes()
45 	{
46 		return nOutputPlanes;
47 	}
48 
filter_CV(ComputeEnv * env,Buffer * packed_input_buf,Buffer * packed_output_buf,const W2Size & size)49 	bool Model::filter_CV
50 	(
51 		ComputeEnv *env,
52 		Buffer *packed_input_buf,
53 		Buffer *packed_output_buf,
54 		const W2Size &size
55 	)
56 	{
57 		size_t in_size = sizeof(float) * size.width * size.height * nInputPlanes;
58 		const float *packed_input = (float*)packed_input_buf->get_read_ptr_host(env, in_size);
59 		float *packed_output = (float*)packed_output_buf->get_write_ptr_host(env);
60 
61 //FutureNote: Should this not just be commented out? Or removed ? (ther's git history if we ever need it again)
62 #if 0 // HAVE_OPENCV
63 		std::vector<cv::Mat> outputPlanes;
64 		std::vector<cv::Mat> inputPlanes;
65 
66 		for (int i = 0; i < nInputPlanes; i++)
67 		{
68 			inputPlanes.push_back(cv::Mat::zeros(cvSize_from_w2(size), CV_32FC1));
69 		}
70 
71 		std::vector<W2Mat> inputPlanes_2(extract_viewlist_from_cvmat(inputPlanes));
72 		unpack_mat(inputPlanes_2, packed_input, size.width, size.height, nInputPlanes);
73 
74 		outputPlanes.clear();
75 
76 		for (int i = 0; i < nOutputPlanes; i++)
77 		{
78 			outputPlanes.push_back(cv::Mat::zeros(cvSize_from_w2(size), CV_32FC1));
79 		}
80 
81 		// filter job issuing
82 		std::vector<std::thread> workerThreads;
83 		std::vector<W2Mat> inputPlanes_w2 = extract_viewlist_from_cvmat(inputPlanes);
84 		std::vector<W2Mat> outputPlanes_w2 = extract_viewlist_from_cvmat(outputPlanes);
85 
86 		int worksPerThread = nOutputPlanes / nJob;
87 		int nJob = modelUtility::getInstance().getNumberOfJobs();
88 
89 		for (int idx = 0; idx < nJob; idx++)
90 		{
91 			if (!(idx == (nJob - 1) && worksPerThread * nJob != nOutputPlanes))
92 			{
93 				workerThreads.push_back
94 				(
95 					std::thread
96 					(
97 						&Model::filterWorker, this,
98 						std::ref(inputPlanes_w2), std::ref(weights),
99 						std::ref(outputPlanes_w2),
100 						static_cast<unsigned int>(worksPerThread * idx),
101 						static_cast<unsigned int>(worksPerThread)
102 					)
103 				);
104 			}
105 			else
106 			{
107 				// worksPerThread * nJob != nOutputPlanes
108 				workerThreads.push_back(
109 						std::thread(
110 							&Model::filterWorker, this,
111 							std::ref(inputPlanes_w2), std::ref(weights),
112 							std::ref(outputPlanes_w2),
113 							static_cast<unsigned int>(worksPerThread * idx),
114 							static_cast<unsigned int>(nOutputPlanes - worksPerThread * idx)
115 						)
116 				);
117 			}
118 		}
119 
120 		// wait for finishing jobs
121 		for (auto& th : workerThreads)
122 		{
123 			th.join();
124 		}
125 
126 		std::vector<W2Mat> outputPlanes_2(extract_viewlist_from_cvmat(outputPlanes));
127 		pack_mat(packed_output, outputPlanes_2, size.width, size.height, nOutputPlanes);
128 
129 		return true;
130 #else
131 		std::atomic<int> yi_shared(0);
132 
133 		auto thread_func = [&]()
134 		{
135 			int w = size.width;
136 			int h = size.height;
137 
138 			while (true)
139 			{
140 				int yi = yi_shared++;
141 
142 				if (yi >= h)
143 				{
144 					break;
145 				}
146 
147 				float *out_line = packed_output + w*nOutputPlanes * yi;
148 
149 				int yi0 = yi-1;
150 				int yi1 = yi;
151 				int yi2 = yi+1;
152 
153 				if (yi == 0)
154 				{
155 					yi0 = 0;
156 				}
157 
158 				if (yi == h-1)
159 				{
160 					yi2 = yi1;
161 				}
162 
163 				const float *in_line0 = packed_input + w * nInputPlanes * yi0;
164 				const float *in_line1 = packed_input + w * nInputPlanes * yi1;
165 				const float *in_line2 = packed_input + w * nInputPlanes * yi2;
166 
167 				for (int xi=0; xi<w; xi++)
168 				{
169 					int x0 = xi-1;
170 					int x1 = xi;
171 					int x2 = xi+1;
172 
173 					if (xi == 0)
174 					{
175 						x0 = 0;
176 					}
177 
178 					if (xi == w-1)
179 					{
180 						x2 = x1;
181 					}
182 
183 					const float *in00 = in_line0 + x0 * nInputPlanes;
184 					const float *in01 = in_line0 + x1 * nInputPlanes;
185 					const float *in02 = in_line0 + x2 * nInputPlanes;
186 
187 					const float *in10 = in_line1 + x0 * nInputPlanes;
188 					const float *in11 = in_line1 + x1 * nInputPlanes;
189 					const float *in12 = in_line1 + x2 * nInputPlanes;
190 
191 					const float *in20 = in_line2 + x0 * nInputPlanes;
192 					const float *in21 = in_line2 + x1 * nInputPlanes;
193 					const float *in22 = in_line2 + x2 * nInputPlanes;
194 
195 					for (int oi=0; oi<nOutputPlanes; oi++)
196 					{
197 						float sum = 0;
198 
199 						for (int ii=0; ii<nInputPlanes; ii++)
200 						{
201 							int wMatIndex = nInputPlanes * oi + ii;
202 							const float *w = weights[wMatIndex].ptr<float>(0);
203 
204 							sum += in00[ii] * w[0];
205 							sum += in01[ii] * w[1];
206 							sum += in02[ii] * w[2];
207 
208 							sum += in10[ii] * w[3];
209 							sum += in11[ii] * w[4];
210 							sum += in12[ii] * w[5];
211 
212 							sum += in20[ii] * w[6];
213 							sum += in21[ii] * w[7];
214 							sum += in22[ii] * w[8];
215 						}
216 
217 						float v = sum;
218 						v += (float) biases[oi];
219 						float mtz = (std::max)(v, 0.0f);
220 						float ltz = (std::min)(v, 0.0f);
221 						v = ltz*0.1f + mtz;
222 
223 						out_line[xi*nOutputPlanes + oi] = v;
224 					}
225 				}
226 			}
227 		};
228 
229 		int w = size.width;
230 		int h = size.height;
231 		std::vector<std::thread> workerThreads;
232 		int nJob = modelUtility::getInstance().getNumberOfJobs();
233 
234 		for (int ji=0; ji<nJob; ji++)
235 		{
236 			workerThreads.emplace_back(std::thread(thread_func));
237 		}
238 
239 		for (auto&th : workerThreads)
240 		{
241 			th.join();
242 		}
243 #endif
244 		return true;
245 	}
246 
247 //#define COMPARE_RESULT
filter_AVX_OpenCL(W2XConv * conv,ComputeEnv * env,Buffer * packed_input_buf,Buffer * packed_output_buf,const W2Size & size)248 	bool Model::filter_AVX_OpenCL
249 	(
250 		W2XConv *conv,
251 		ComputeEnv *env,
252 		Buffer *packed_input_buf,
253 		Buffer *packed_output_buf,
254 		const W2Size &size
255 	)
256 	{
257 		int vec_width;
258 		int weight_step;
259 		int nJob = modelUtility::getInstance().getNumberOfJobs();
260 		const struct W2XConvProcessor *proc = conv->target_processor;
261 
262 		bool gpu = (proc->type == W2XCONV_PROC_OPENCL) || (proc->type == W2XCONV_PROC_CUDA);
263 
264 		if (gpu)
265 		{
266 			weight_step = GPU_VEC_WIDTH;
267 			vec_width = GPU_VEC_WIDTH;
268 		}
269 		else
270 		{
271 			weight_step = nOutputPlanes;
272 			vec_width = VEC_WIDTH;
273 		}
274 
275 		float *weight_flat = (float*)w2xc_aligned_malloc(sizeof(float)*nInputPlanes*weight_step*3*3, 64);
276 		float *fbiases_flat = (float*)w2xc_aligned_malloc(sizeof(float) * biases.size(), 64);
277 
278 		for (int i=0; i<(int)biases.size(); i++)
279 		{
280 			fbiases_flat[i] = (float) biases[i];
281 		}
282 
283 		if (nOutputPlanes == 1)
284 		{
285 			if (gpu)
286 			{
287 				for (int ii=0; ii<nInputPlanes; ii++)
288 				{
289 					W2Mat &wm = weights[ii];
290 					const float *src0 = wm.ptr<float>(0);
291 					const float *src1 = wm.ptr<float>(1);
292 					const float *src2 = wm.ptr<float>(2);
293 
294 					float *dst = weight_flat + ii * 9;
295 					dst[0] = src0[0];
296 					dst[1] = src0[1];
297 					dst[2] = src0[2];
298 
299 					dst[3] = src1[0];
300 					dst[4] = src1[1];
301 					dst[5] = src1[2];
302 
303 					dst[6] = src2[0];
304 					dst[7] = src2[1];
305 					dst[8] = src2[2];
306 
307 				}
308 			}
309 			else
310 			{
311 				for (int ii=0; ii<nInputPlanes; ii++)
312 				{
313 					W2Mat &wm = weights[ii];
314 					const float *src0 = wm.ptr<float>(0);
315 					const float *src1 = wm.ptr<float>(1);
316 					const float *src2 = wm.ptr<float>(2);
317 
318 					int ii_0 = ii % vec_width;
319 					int ii_1 = (ii / vec_width) * vec_width;
320 
321 					float *dst = weight_flat + ii_1 * 9  + ii_0;
322 					dst[0 * vec_width] = src0[0];
323 					dst[1 * vec_width] = src0[1];
324 					dst[2 * vec_width] = src0[2];
325 
326 					dst[3 * vec_width] = src1[0];
327 					dst[4 * vec_width] = src1[1];
328 					dst[5 * vec_width] = src1[2];
329 
330 					dst[6 * vec_width] = src2[0];
331 					dst[7 * vec_width] = src2[1];
332 					dst[8 * vec_width] = src2[2];
333 				}
334 			}
335 		}
336 		else if (gpu && nInputPlanes == 1)
337 		{
338 			for (int oi=0; oi<nOutputPlanes; oi++)
339 			{
340 				W2Mat &wm = weights[oi];
341 				const float *src0 = wm.ptr<float>(0);
342 				const float *src1 = wm.ptr<float>(1);
343 				const float *src2 = wm.ptr<float>(2);
344 
345 				float *dst = weight_flat + oi * 9;
346 				dst[0] = src0[0];
347 				dst[1] = src0[1];
348 				dst[2] = src0[2];
349 
350 				dst[3] = src1[0];
351 				dst[4] = src1[1];
352 				dst[5] = src1[2];
353 
354 				dst[6] = src2[0];
355 				dst[7] = src2[1];
356 				dst[8] = src2[2];
357 			}
358 		}
359 		else if (nOutputPlanes == 3)
360 		{
361 			/* |       o0        |       o1        | o2 ... |
362 			 * |i0 i1 i2 ... i127|i0 i1 i2 ... i127| ...    |*/
363 			for (int oi=0; oi<nOutputPlanes; oi++)
364 			{
365 				for (int ii=0; ii<nInputPlanes; ii++)
366 				{
367 					int mi = oi*nInputPlanes+ii;
368 					W2Mat &wm = weights[mi];
369 					const float *src0 = wm.ptr<float>(0);
370 					const float *src1 = wm.ptr<float>(1);
371 					const float *src2 = wm.ptr<float>(2);
372 
373 					float *dst = weight_flat + (oi * nInputPlanes * 9) + ii;
374 					dst[0*nInputPlanes] = src0[0];
375 					dst[1*nInputPlanes] = src0[1];
376 					dst[2*nInputPlanes] = src0[2];
377 
378 					dst[3*nInputPlanes] = src1[0];
379 					dst[4*nInputPlanes] = src1[1];
380 					dst[5*nInputPlanes] = src1[2];
381 
382 					dst[6*nInputPlanes] = src2[0];
383 					dst[7*nInputPlanes] = src2[1];
384 					dst[8*nInputPlanes] = src2[2];
385 				}
386 			}
387 		}
388 		else if (gpu && (nInputPlanes == 3) && (nOutputPlanes == 32))
389 		{
390 			/* | i0             | i1        | i2 .. iN-1|
391 			 * |o0 o1 o2 o3..o31|o0 .... o32| ....      |
392 			 * |<-            ->|
393 			 * |    32          |
394 			 * |   x  9         |
395 			 */
396 
397 			for (int oi=0; oi<nOutputPlanes; oi++)
398 			{
399 				for (int ii=0; ii<nInputPlanes; ii++)
400 				{
401 					int mi = oi*nInputPlanes+ii;
402 					W2Mat &wm = weights[mi];
403 					const float *src0 = wm.ptr<float>(0);
404 					const float *src1 = wm.ptr<float>(1);
405 					const float *src2 = wm.ptr<float>(2);
406 
407 					float *dst = weight_flat + (ii * nOutputPlanes * 9) + oi;
408 					dst[0*nOutputPlanes] = src0[0];
409 					dst[1*nOutputPlanes] = src0[1];
410 					dst[2*nOutputPlanes] = src0[2];
411 
412 					dst[3*nOutputPlanes] = src1[0];
413 					dst[4*nOutputPlanes] = src1[1];
414 					dst[5*nOutputPlanes] = src1[2];
415 
416 					dst[6*nOutputPlanes] = src2[0];
417 					dst[7*nOutputPlanes] = src2[1];
418 					dst[8*nOutputPlanes] = src2[2];
419 				}
420 			}
421 		}
422 		else
423 		{
424 			bool simd_oplane = false;
425 			bool simd_iplane = false;
426 			int simd_vec_width = 0;
427 
428 			if (proc->type == W2XCONV_PROC_HOST)
429 			{
430 				switch (proc->sub_type)
431 				{
432 					case W2XCONV_PROC_HOST_SSE3:
433 					{
434 						simd_vec_width = 4;
435 						simd_oplane = true;
436 						break;
437 					}
438 					case W2XCONV_PROC_HOST_NEON:
439 					{
440 						simd_vec_width = 4;
441 						simd_oplane = true;
442 						break;
443 					}
444 					case W2XCONV_PROC_HOST_ALTIVEC:
445 					{
446 						simd_vec_width = 8;
447 						simd_oplane = true;
448 						break;
449 					}
450 					case W2XCONV_PROC_HOST_AVX:
451 					case W2XCONV_PROC_HOST_FMA:
452 					{
453 						simd_vec_width = 8;
454 						simd_oplane = true;
455 						break;
456 					}
457 				}
458 			}
459 
460 			simd_oplane = simd_oplane && (nInputPlanes%(simd_vec_width*4) == 0) && (nOutputPlanes%(simd_vec_width*2) == 0);
461 			simd_iplane = simd_iplane && (nInputPlanes%(simd_vec_width*4) == 0) && (nOutputPlanes%(simd_vec_width*2) == 0);
462 
463 			if (simd_oplane || simd_iplane)
464 			{
465 				/*
466 				 * weight_chunk (16x32x3x4 = 6144[Byte])
467 				 * (where op_block_size=16, ip_block_size=32)
468 				 *
469 				 * 111                                            oplane x16
470 				 * 16 16 .. (x16)  ..16                           iplane x32
471 				 *            \               |               /   horiz  x3
472 				 *                                                oplane xnOutputPlane_block
473 				 *                                                iplane xnInputPlane_block
474 				 *                                                vert   x3
475 				 */
476 				int ip_block_size;
477 				int op_block_size;
478 
479 				if (simd_oplane)
480 				{
481 					ip_block_size = (simd_vec_width*4);
482 					op_block_size = (simd_vec_width*2);
483 				}
484 				else {
485 					ip_block_size = (simd_vec_width*2);
486 					op_block_size = (simd_vec_width*4);
487 				}
488 
489 				int nInputPlane_block = nInputPlanes/ip_block_size;
490 				int nOutputPlane_block = nOutputPlanes/op_block_size;
491 
492 				float *dst = weight_flat;
493 
494 				for (int dposy=0; dposy<3; dposy++)
495 				{
496 					for (int ii0=0; ii0<nInputPlane_block; ii0++)
497 					{
498 						for (int oi0=0; oi0<nOutputPlane_block; oi0++)
499 						{
500 							for (int dposx=0; dposx<3; dposx++)
501 							{
502 								if (simd_oplane)
503 								{
504 									for (int ii1=0; ii1<ip_block_size; ii1++)
505 									{
506 										for (int oi1=0; oi1<op_block_size; oi1++)
507 										{
508 											int ii = ii0*ip_block_size + ii1;
509 											int oi = oi0*op_block_size + oi1;
510 											int mi = oi*nInputPlanes + ii;
511 
512 											W2Mat &wm = weights[mi];
513 											float &src = wm.at<float>(dposy, dposx);
514 											*dst = src;
515 
516 											dst++;
517 										}
518 									}
519 								}
520 								else
521 								{
522 									for (int oi1=0; oi1<op_block_size; oi1++)
523 									{
524 										for (int ii1=0; ii1<ip_block_size; ii1++)
525 										{
526 											int ii = ii0*ip_block_size + ii1;
527 											int oi = oi0*op_block_size + oi1;
528 											int mi = oi*nInputPlanes + ii;
529 
530 											W2Mat &wm = weights[mi];
531 											float &src = wm.at<float>(dposy, dposx);
532 											*dst = src;
533 
534 											dst++;
535 										}
536 									}
537 								}
538 							}
539 						}
540 					}
541 				}
542 			}
543 			else
544 			{
545 				/* | i0        | i1        | i2 .. iN-1|   i0      | i1        | ..
546 				 * |o0 o1 o2 o3|o0 o1 o2 o3| ....      |o4 o5 o6 o7|o4 o5 o6 o7| ..
547 				 * |<-       ->|
548 				 * | VEC_WIDTH |
549 				 * |   x  9    |
550 				 */
551 				for (int oi=0; oi<nOutputPlanes; oi++)
552 				{
553 					for (int ii=0; ii<nInputPlanes; ii++)
554 					{
555 						int mi = oi*nInputPlanes+ii;
556 						W2Mat &wm = weights[mi];
557 						const float *src0 = wm.ptr<float>(0);
558 						const float *src1 = wm.ptr<float>(1);
559 						const float *src2 = wm.ptr<float>(2);
560 
561 						int oi_0 = oi % vec_width;
562 						int oi_1 = (oi / vec_width) * vec_width;
563 
564 						float *dst = weight_flat + ((ii*weight_step + oi_1) * 9) + oi_0;
565 						dst[0*vec_width] = src0[0];
566 						dst[1*vec_width] = src0[1];
567 						dst[2*vec_width] = src0[2];
568 
569 						dst[3*vec_width] = src1[0];
570 						dst[4*vec_width] = src1[1];
571 						dst[5*vec_width] = src1[2];
572 
573 						dst[6*vec_width] = src2[0];
574 						dst[7*vec_width] = src2[1];
575 						dst[8*vec_width] = src2[2];
576 					}
577 				}
578 			}
579 		}
580 
581 		bool compare_result = false;
582 
583 #ifdef COMPARE_RESULT
584 		compare_result = true;
585 #endif
586 
587 		size_t in_size = size.width * size.height * sizeof(float) * nInputPlanes;
588 		size_t out_size = size.width * size.height * sizeof(float) * nOutputPlanes;
589 
590 		if (compare_result)
591 		{
592 			Buffer *packed_output_cv_buf = new Buffer(env, sizeof(float) * size.width * size.height * nOutputPlanes);
593 
594 			double t0 = getsec();
595 			filter_CV(env, packed_input_buf, packed_output_cv_buf, size);
596 			//filter_FMA_impl(packed_input, packed_output_cv,
597 			//		nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, size, nJob);
598 			double t1 = getsec();
599 
600 			/* 3x3 = 9 fma */
601 			double ops = size.width * size.height * 9.0 * 2.0 * nOutputPlanes * nInputPlanes;
602 
603 			if (proc->type == W2XCONV_PROC_OPENCL)
604 			{
605 				filter_OpenCL_impl
606 				(
607 					env,
608 					packed_input_buf,
609 					packed_output_buf,
610 					nInputPlanes,
611 					nOutputPlanes,
612 					fbiases_flat,
613 					weight_flat,
614 					size.width,
615 					size.height,
616 					nJob
617 				);
618 			}
619 			else if (proc->type == W2XCONV_PROC_CUDA)
620 			{
621 				filter_CUDA_impl
622 				(
623 					env,
624 					packed_input_buf,
625 					packed_output_buf,
626 					nInputPlanes,
627 					nOutputPlanes,
628 					fbiases_flat,
629 					weight_flat,
630 					size.width,
631 					size.height,
632 					nJob
633 				);
634 			}
635 			else
636 			{
637 				const float *packed_input = (float*)packed_input_buf->get_read_ptr_host(env, in_size);
638 				float *packed_output = (float*)packed_output_buf->get_write_ptr_host(env);
639 
640 				switch (proc->sub_type)
641 				{
642 #ifdef X86OPT
643 					case W2XCONV_PROC_HOST_FMA:
644 					{
645 						filter_FMA_impl(env, packed_input, packed_output,
646 								nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat,
647 								size.width, size.height, nJob);
648 						break;
649 					}
650 					case W2XCONV_PROC_HOST_AVX:
651 					{
652 						filter_AVX_impl(env, packed_input, packed_output,
653 								nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat,
654 								size.width, size.height, nJob);
655 						break;
656 					}
657 					case W2XCONV_PROC_HOST_SSE3:
658 					{
659 						filter_SSE_impl(env, packed_input, packed_output,
660 								nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat,
661 								size.width, size.height, nJob);
662 						break;
663 					}
664 #endif
665 #ifdef ARMOPT
666 					case W2XCONV_PROC_HOST_NEON:
667 					{
668 						filter_NEON_impl(env, packed_input, packed_output,
669 								nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat,
670 								size.width, size.height, nJob);
671 						break;
672 					}
673 #endif
674 #ifdef PPCOPT
675 					case W2XCONV_PROC_HOST_ALTIVEC:
676 					{
677 						filter_AltiVec_impl(env, packed_input, packed_output,
678 								nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat,
679 								size.width, size.height, nJob);
680 						break;
681 					}
682 #endif
683 					default:
684 					{
685 						filter_CV(env, packed_input_buf, packed_output_buf, size);
686 						break;
687 					}
688 				}
689 			}
690 
691 			double t2 = getsec();
692 
693 			printf("(w=%d,h=%d) (ip=%d,op=%d) %f %f %f[gflops]\n", size.width, size.height, nInputPlanes, nOutputPlanes, t1-t0, t2-t1, ops/(1000*1000*1000));
694 			printf("ver2 : %f [Gflops]\n", (ops/(1000.0*1000.0*1000.0)) / (t2-t1));
695 			printf("orig : %f [Gflops]\n", (ops/(1000.0*1000.0*1000.0)) / (t1-t0));
696 			int error_count = 0;
697 
698 			float *packed_output_cv = (float*)packed_output_cv_buf->get_read_ptr_host(env, out_size);
699 			float *packed_output = (float*)packed_output_buf->get_read_ptr_host(env, out_size);
700 
701 			for (int i=0; i<size.width * size.height * nOutputPlanes; i++)
702 			{
703 				float v0 = packed_output_cv[i];
704 				float v1 = packed_output[i];
705 				float d = fabs(v0 - v1);
706 
707 				float r0 = d/fabs(v0);
708 				float r1 = d/fabs(v1);
709 
710 				float r = (std::max)(r0, r1);
711 
712 				if (r > 0.1f && d > 0.000001f)
713 				{
714 					int plane = i % nOutputPlanes;
715 					int pixpos = i / nOutputPlanes;
716 					int xpos = pixpos % size.width;
717 					int ypos = pixpos / size.width;
718 
719 					printf("d=%.20f %.20f %.20f @ (%d,%d,%d,%d) \n",r, v0, v1, xpos, ypos, plane, i);
720 					error_count++;
721 
722 					if (error_count >= 256)
723 					{
724 						exit(1);
725 					}
726 				}
727 			}
728 
729 			if (error_count != 0)
730 			{
731 				exit(1);
732 			}
733 
734 			delete packed_output_cv_buf;
735 		}
736 		else
737 		{
738 			if (proc->type == W2XCONV_PROC_OPENCL)
739 			{
740 				filter_OpenCL_impl
741 				(
742 					env,
743 					packed_input_buf,
744 					packed_output_buf,
745 					nInputPlanes,
746 					nOutputPlanes,
747 					fbiases_flat,
748 					weight_flat,
749 					size.width,
750 					size.height,
751 					nJob
752 				);
753 			}
754 			else if (proc->type == W2XCONV_PROC_CUDA)
755 			{
756 				filter_CUDA_impl
757 				(
758 					env,
759 					packed_input_buf,
760 					packed_output_buf,
761 					nInputPlanes,
762 					nOutputPlanes,
763 					fbiases_flat,
764 					weight_flat,
765 					size.width,
766 					size.height,
767 					nJob
768 				);
769 			}
770 			else
771 			{
772 				const float *packed_input = (float*)packed_input_buf->get_read_ptr_host(env, in_size);
773 				float *packed_output = (float*)packed_output_buf->get_write_ptr_host(env);
774 
775 				switch (proc->sub_type)
776 				{
777 #ifdef X86OPT
778 					case W2XCONV_PROC_HOST_FMA:
779 					{
780 						filter_FMA_impl
781 						(
782 							env,
783 							packed_input,
784 							packed_output,
785 							nInputPlanes,
786 							nOutputPlanes,
787 							fbiases_flat,
788 							weight_flat,
789 							size.width,
790 							size.height,
791 							nJob
792 						);
793 						break;
794 					}
795 					case W2XCONV_PROC_HOST_AVX:
796 					{
797 						filter_AVX_impl
798 						(
799 							env,
800 							packed_input,
801 							packed_output,
802 							nInputPlanes,
803 							nOutputPlanes,
804 							fbiases_flat,
805 							weight_flat,
806 							size.width,
807 							size.height,
808 							nJob
809 						);
810 						break;
811 					}
812 					case W2XCONV_PROC_HOST_SSE3:
813 					{
814 						filter_SSE_impl
815 						(
816 							env,
817 							packed_input,
818 							packed_output,
819 							nInputPlanes,
820 							nOutputPlanes,
821 							fbiases_flat,
822 							weight_flat,
823 							size.width,
824 							size.height,
825 							nJob
826 						);
827 						break;
828 					}
829 #endif
830 #ifdef ARMOPT
831 					case W2XCONV_PROC_HOST_NEON:
832 					{
833 						filter_NEON_impl
834 						(
835 							env,
836 							packed_input,
837 							packed_output,
838 							nInputPlanes,
839 							nOutputPlanes,
840 							fbiases_flat,
841 							weight_flat,
842 							size.width,
843 							size.height,
844 							nJob
845 						);
846 						break;
847 					}
848 #endif
849 #ifdef PPCOPT
850 					case W2XCONV_PROC_HOST_ALTIVEC:
851 					{
852 						filter_AltiVec_impl
853 						(
854 							env,
855 							packed_input,
856 							packed_output,
857 							nInputPlanes,
858 							nOutputPlanes,
859 							fbiases_flat,
860 							weight_flat,
861 							size.width,
862 							size.height,
863 							nJob
864 						);
865 						break;
866 					}
867 #endif
868 					default:
869 					{
870 						filter_CV(env, packed_input_buf, packed_output_buf, size);
871 						break;
872 					}
873 				}
874 			}
875 		}
876 
877 		w2xc_aligned_free(fbiases_flat);
878 		w2xc_aligned_free(weight_flat);
879 
880 		return true;
881 	}
882 
filter(W2XConv * conv,ComputeEnv * env,Buffer * packed_input_buf,Buffer * packed_output_buf,W2Size const & size)883 	bool Model::filter (W2XConv *conv, ComputeEnv *env, Buffer *packed_input_buf, Buffer *packed_output_buf, W2Size const &size)
884 	{
885 		bool ret;
886 
887 		bool avx_available = true;
888 		bool cl_available = true;
889 		bool cuda_available = true;
890 
891 		if (nOutputPlanes > GPU_VEC_WIDTH)
892 		{
893 			cl_available = false;
894 			cuda_available = false;
895 		}
896 
897 		if (nOutputPlanes == 32 && nInputPlanes == 1)
898 		{
899 			/* i1 o32 filter */
900 		}
901 		else if (nOutputPlanes == 1 && nInputPlanes == 128)
902 		{
903 			/* i128 o32 filter */
904 		}
905 		else if (nOutputPlanes == 32 && nInputPlanes == 3)
906 		{
907 			/* i3 o32 filter */
908 		}
909 		else if (nOutputPlanes == 3 && nInputPlanes == 128)
910 		{
911 			/* i128 o3 filter */
912 		}
913 		else
914 		{
915 			if (nInputPlanes & 1)
916 			{
917 				cl_available = false;
918 				cuda_available = false;
919 				avx_available = false;
920 			}
921 
922 			if (nOutputPlanes & 31)
923 			{
924 				cl_available = false;
925 				cuda_available = false;
926 				avx_available = false;
927 			}
928 
929 			if (nInputPlanes == 32 || nInputPlanes == 64 || nInputPlanes == 128)
930 			{
931 				/* ok */
932 			}
933 			else
934 			{
935 				cuda_available = false;
936 			}
937 		}
938 
939 		//printf("%d %d %d\n",
940 		//       (int)cuda_available,
941 		//       (int)cl_available,
942 		//       (int)avx_available);
943 
944 		const struct W2XConvProcessor *proc = conv->target_processor;
945 
946 		if ((cl_available && proc->type == W2XCONV_PROC_OPENCL) ||
947 			(cuda_available && proc->type == W2XCONV_PROC_CUDA) ||
948 			(avx_available && proc->type == W2XCONV_PROC_HOST))
949 		{
950 			ret = filter_AVX_OpenCL(conv, env, packed_input_buf, packed_output_buf, size);
951 		}
952 		else
953 		{
954 			ret = filter_CV(env, packed_input_buf, packed_output_buf, size);
955 		}
956 
957 		return ret;
958 	}
959 
loadModelFromJSONObject(picojson::object & jsonObj)960 	bool Model::loadModelFromJSONObject(picojson::object &jsonObj) {
961 
962 		// nInputPlanes,nOutputPlanes,kernelSize have already set.
963 		int matProgress = 0;
964 		picojson::array &wOutputPlane = jsonObj["weight"].get<picojson::array>();
965 
966 		// setting weight matrices
967 		for (auto&& wInputPlaneV : wOutputPlane)
968 		{
969 			picojson::array &wInputPlane = wInputPlaneV.get<picojson::array>();
970 
971 			for (auto&& weightMatV : wInputPlane)
972 			{
973 				picojson::array &weightMat = weightMatV.get<picojson::array>();
974 				W2Mat writeMatrix(kernelSize, kernelSize, CV_32FC1);
975 
976 				for (int writingRow = 0; writingRow < kernelSize; writingRow++)
977 				{
978 					auto& weightMatRowV = weightMat.at(writingRow);
979 					picojson::array &weightMatRow = weightMatRowV.get<
980 							picojson::array>();
981 
982 					for (int index = 0; index < kernelSize; index++)
983 					{
984 						writeMatrix.ptr<float>(writingRow)[index] = (float) weightMatRow[index].get<double>();
985 					} // for(weightMatRow) (writing 1 row finished)
986 
987 				} // for(weightMat) (writing 1 matrix finished)
988 
989 				weights.push_back(std::move(writeMatrix));
990 				matProgress++;
991 			} // for(wInputPlane) (writing matrices in set of wInputPlane finished)
992 
993 		} //for(wOutputPlane) (writing all matrices finished)
994 
995 		// setting biases
996 		picojson::array biasesData = jsonObj["bias"].get<picojson::array>();
997 
998 		for (int index = 0; index < nOutputPlanes; index++)
999 		{
1000 			biases[index] = biasesData[index].get<double>();
1001 		}
1002 
1003 		return true;
1004 	}
1005 
1006 #ifdef HAVE_OPENCV
filterWorker(std::vector<W2Mat> & inputPlanes_w2,std::vector<W2Mat> & weightMatrices_w2,std::vector<W2Mat> & outputPlanes_w2,unsigned int beginningIndex,unsigned int nWorks)1007 	bool Model::filterWorker
1008 	(
1009 		std::vector<W2Mat> &inputPlanes_w2,
1010 		std::vector<W2Mat> &weightMatrices_w2,
1011 		std::vector<W2Mat> &outputPlanes_w2,
1012 		unsigned int beginningIndex,
1013 		unsigned int nWorks
1014 	)
1015 	{
1016 		std::vector<cv::Mat> inputPlanes;
1017 		std::vector<cv::Mat> weightMatrices;
1018 		std::vector<cv::Mat> outputPlanes;
1019 
1020 		extract_viewlist_to_cvmat(inputPlanes, inputPlanes_w2);
1021 		extract_viewlist_to_cvmat(weightMatrices, weightMatrices_w2);
1022 		extract_viewlist_to_cvmat(outputPlanes, outputPlanes_w2);
1023 
1024 		cv::Size ipSize = inputPlanes[0].size();
1025 		// filter processing
1026 		// input : inputPlanes
1027 		// kernel : weightMatrices
1028 
1029 		for (int opIndex = beginningIndex; opIndex < (int)(beginningIndex + nWorks); opIndex++)
1030 		{
1031 			int wMatIndex = nInputPlanes * opIndex;
1032 			cv::Mat outputPlane = cv::Mat::zeros(ipSize, CV_32FC1);
1033 			cv::Mat &uIntermediatePlane = outputPlane; // all zero matrix
1034 
1035 			for (int ipIndex = 0; ipIndex < nInputPlanes; ipIndex++)
1036 			{
1037 				cv::Mat &uInputPlane = inputPlanes[ipIndex];
1038 				cv::Mat &weightMatrix = weightMatrices[wMatIndex + ipIndex];
1039 				cv::Mat filterOutput = cv::Mat::zeros(ipSize, CV_32FC1);
1040 
1041 				cv::filter2D(uInputPlane, filterOutput, -1, weightMatrix, cv::Point(-1, -1), 0.0, cv::BORDER_REPLICATE);
1042 
1043 				cv::add(uIntermediatePlane, filterOutput, uIntermediatePlane);
1044 			}
1045 
1046 			cv::add(uIntermediatePlane, biases[opIndex], uIntermediatePlane);
1047 			cv::Mat moreThanZero = cv::Mat(ipSize,CV_32FC1,0.0);
1048 			cv::Mat lessThanZero = cv::Mat(ipSize,CV_32FC1,0.0);
1049 			(cv::max)(uIntermediatePlane, 0.0, moreThanZero);
1050 			(cv::min)(uIntermediatePlane, 0.0, lessThanZero);
1051 			cv::scaleAdd(lessThanZero, 0.1, moreThanZero, uIntermediatePlane);
1052 			uIntermediatePlane.copyTo(outputPlanes[opIndex]);
1053 
1054 		} // for index
1055 
1056 		return true;
1057 	}
1058 #endif
1059 
1060 	modelUtility * modelUtility::instance = nullptr;
1061 
getInstance()1062 	modelUtility& modelUtility::getInstance()
1063 	{
1064 		if(instance == nullptr)
1065 		{
1066 			instance = new modelUtility();
1067 		}
1068 		return *instance;
1069 	}
1070 
Model(FILE * binfp)1071 	Model::Model(FILE *binfp)
1072 	{
1073 		uint32_t nInputPlanes, nOutputPlanes;
1074 
1075 		fread(&nInputPlanes, 4, 1, binfp);
1076 		fread(&nOutputPlanes, 4, 1, binfp);
1077 
1078 		this->nInputPlanes = nInputPlanes;
1079 		this->nOutputPlanes = nOutputPlanes;
1080 		this->kernelSize = 3;
1081 		this->weights.clear();
1082 		this->biases.clear();
1083 
1084 		// setting weight matrices
1085 		for (uint32_t oi=0; oi<nOutputPlanes; oi++)
1086 		{
1087 			for (uint32_t ii=0; ii<nInputPlanes; ii++)
1088 			{
1089 				W2Mat writeMatrix(kernelSize, kernelSize, CV_32FC1);
1090 
1091 				for (int yi=0; yi<3; yi++)
1092 				{
1093 					for (int xi=0; xi<3; xi++)
1094 					{
1095 						double v;
1096 						fread(&v, 8, 1, binfp);
1097 						writeMatrix.at<float>(yi, xi) = (float) v;
1098 					}
1099 				}
1100 
1101 				this->weights.emplace_back(std::move(writeMatrix));
1102 			}
1103 		}
1104 
1105 		for (uint32_t oi = 0; oi < nOutputPlanes; oi++)
1106 		{
1107 			double v;
1108 			fread(&v, 8, 1, binfp);
1109 			biases.push_back(v);
1110 		}
1111 	}
1112 
Model(int nInputPlane,int nOutputPlane,const float * coef_list,const float * bias)1113 	Model::Model(int nInputPlane, int nOutputPlane, const float *coef_list, const float *bias)
1114 	{
1115 		this->nInputPlanes = nInputPlane;
1116 		this->nOutputPlanes = nOutputPlane;
1117 		this->kernelSize = 3;
1118 		this->weights.clear();
1119 		this->biases.clear();
1120 
1121 		int cur = 0;
1122 
1123 		// setting weight matrices
1124 		for (uint32_t oi = 0; oi < (uint32_t)nOutputPlanes; oi++)
1125 		{
1126 			for (uint32_t ii = 0; ii < (uint32_t)nInputPlanes; ii++)
1127 			{
1128 				W2Mat writeMatrix(kernelSize, kernelSize, CV_32FC1);
1129 
1130 				for (int yi = 0; yi < 3; yi++)
1131 				{
1132 					for (int xi = 0; xi < 3; xi++)
1133 					{
1134 						double v = coef_list[cur++];
1135 						writeMatrix.at<float>(yi, xi) = (float) v;
1136 					}
1137 				}
1138 
1139 				this->weights.emplace_back(std::move(writeMatrix));
1140 			}
1141 		}
1142 
1143 		for (uint32_t oi = 0; oi < (uint32_t)nOutputPlanes; oi++)
1144 		{
1145 			double v = bias[oi];
1146 			biases.push_back(v);
1147 		}
1148 	}
generateModelFromJSON(const _tstring & fileName,std::vector<std::unique_ptr<Model>> & models)1149 	bool modelUtility::generateModelFromJSON
1150 	(
1151 		const _tstring &fileName,
1152 		std::vector<std::unique_ptr<Model> > &models
1153 	)
1154 	{
1155 		_tstring binpath = fileName + _T(".bin");
1156 		FILE *binfp = _tfopen(binpath.c_str(), _T("rb"));
1157 
1158 		if (binfp)
1159 		{
1160 			bool need_update = update_test(binpath.c_str(), fileName.c_str());
1161 
1162 			if (need_update)
1163 			{
1164 				fclose(binfp);
1165 				binfp = NULL;
1166 			}
1167 		}
1168 
1169 		if (binfp)
1170 		{
1171 			uint32_t nModel;
1172 
1173 			fread(&nModel, 4, 1, binfp);
1174 
1175 			for (uint32_t i=0; i<nModel; i++)
1176 			{
1177 				std::unique_ptr<Model> m = std::unique_ptr<Model>(
1178 					new Model(binfp));
1179 				models.push_back(std::move(m));
1180 			}
1181 
1182 			fclose(binfp);
1183 		}
1184 		else
1185 		{
1186 			std::ifstream jsonFile;
1187 
1188 			jsonFile.open(fileName);
1189 			if (!jsonFile.is_open())
1190 			{
1191 				std::string fname = _tstr2str(fileName);
1192 				std::cerr << "Error : couldn't open " << fname << std::endl;
1193 				return false;
1194 			}
1195 
1196 			picojson::value jsonValue;
1197 			jsonFile >> jsonValue;
1198 
1199 			std::string errMsg = picojson::get_last_error();
1200 
1201 			if (!errMsg.empty())
1202 			{
1203 				std::cerr << "Error : PicoJSON Error : " << errMsg << std::endl;
1204 				return false;
1205 			}
1206 
1207 			picojson::array& objectArray = jsonValue.get<picojson::array>();
1208 
1209 			for (auto&& obj : objectArray)
1210 			{
1211 				std::unique_ptr<Model> m = std::unique_ptr<Model>(
1212 					new Model(obj.get<picojson::object>()));
1213 				models.push_back(std::move(m));
1214 			}
1215 
1216 			binfp = _tfopen(binpath.c_str(), _T("wb"));
1217 			if (binfp)
1218 			{
1219 				size_t nModel = objectArray.size();
1220 				fwrite(&nModel, 4, 1, binfp);
1221 
1222 				for (auto&& m : models)
1223 				{
1224 					uint32_t nInputPlanes = m->getNInputPlanes();
1225 					uint32_t nOutputPlanes = m->getNOutputPlanes();
1226 
1227 					fwrite(&nInputPlanes, 4, 1, binfp);
1228 					fwrite(&nOutputPlanes, 4, 1, binfp);
1229 
1230 					std::vector<W2Mat> &weights = m->getWeigts();
1231 
1232 					int nw = (int) weights.size();
1233 
1234 					for (int wi = 0; wi < nw; wi++)
1235 					{
1236 						W2Mat &wm = weights[wi];
1237 						double v;
1238 						v = wm.at<float>(0,0);
1239 						fwrite(&v, 1, 8, binfp);
1240 						v = wm.at<float>(0,1);
1241 						fwrite(&v, 1, 8, binfp);
1242 						v = wm.at<float>(0,2);
1243 						fwrite(&v, 1, 8, binfp);
1244 
1245 						v = wm.at<float>(1,0);
1246 						fwrite(&v, 1, 8, binfp);
1247 						v = wm.at<float>(1,1);
1248 						fwrite(&v, 1, 8, binfp);
1249 						v = wm.at<float>(1,2);
1250 						fwrite(&v, 1, 8, binfp);
1251 
1252 						v = wm.at<float>(2,0);
1253 						fwrite(&v, 1, 8, binfp);
1254 						v = wm.at<float>(2,1);
1255 						fwrite(&v, 1, 8, binfp);
1256 						v = wm.at<float>(2,2);
1257 						fwrite(&v, 1, 8, binfp);
1258 					}
1259 
1260 					std::vector<double> &b = m->getBiases();
1261 					fwrite(&b[0], 8, b.size(), binfp);
1262 				}
1263 
1264 				fclose(binfp);
1265 			}
1266 		}
1267 		return true;
1268 	}
1269 
generateModelFromMEM(int layer_depth,int num_input_plane,const int * num_map,const float * coef_list,const float * bias,std::vector<std::unique_ptr<Model>> & models)1270 	void modelUtility::generateModelFromMEM
1271 	(
1272 		int layer_depth,
1273 		int num_input_plane,
1274 		const int *num_map, // num_map[layer_depth]
1275 		const float *coef_list, // coef_list[layer_depth][num_map][3x3]
1276 		const float *bias, // bias[layer_depth][num_map]
1277 		std::vector<std::unique_ptr<Model> > &models
1278 	)
1279 	{
1280 		int cur = 0;
1281 		models.resize(layer_depth);
1282 
1283 		models[0] = std::unique_ptr<Model>(new Model(num_input_plane, num_map[0], &coef_list[0], &bias[0]));
1284 
1285 		cur += num_map[0];
1286 
1287 		for (int li = 1; li < layer_depth; li++)
1288 		{
1289 			models[li] = std::unique_ptr<Model>(new Model(num_map[li - 1], num_map[li], &coef_list[cur * 3 * 3], &bias[cur]));
1290 			cur += num_map[li];
1291 		}
1292 	}
1293 
setNumberOfJobs(int setNJob)1294 	bool modelUtility::setNumberOfJobs(int setNJob)
1295 	{
1296 		if(setNJob < 1)
1297 		{
1298 			return false;
1299 		}
1300 
1301 		nJob = setNJob;
1302 
1303 		return true;
1304 	};
1305 
getNumberOfJobs()1306 	int modelUtility::getNumberOfJobs()
1307 	{
1308 		return nJob;
1309 	}
1310 
1311 	// for debugging
printWeightMatrix()1312 	void Model::printWeightMatrix()
1313 	{
1314 
1315 		for (auto&& weightMatrix : weights)
1316 		{
1317 			//std::cout << weightMatrix << std::endl;
1318 		}
1319 
1320 	}
1321 
printBiases()1322 	void Model::printBiases() {
1323 
1324 		for (auto&& bias : biases)
1325 		{
1326 			std::cout << bias << std::endl;
1327 		}
1328 	}
1329 }
1330