1 /* 2 * The MIT License (MIT) 3 * This file is part of waifu2x-converter-cpp 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a copy 6 * of this software and associated documentation files (the "Software"), to deal 7 * in the Software without restriction, including without limitation the rights 8 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 * copies of the Software, and to permit persons to whom the Software is 10 * furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in all 13 * copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 */ 23 24 #include "modelHandler.hpp" 25 // #include <iostream> in modelHandler.hpp 26 #include "cvwrap.hpp" 27 #include <fstream> 28 #include <thread> 29 #include <atomic> 30 #include "sec.hpp" 31 //#include "threadPool.hpp" 32 #include "common.hpp" 33 #include "filters.hpp" 34 #include "params.h" 35 36 namespace w2xc 37 { 38 getNInputPlanes()39 int Model::getNInputPlanes() 40 { 41 return nInputPlanes; 42 } 43 getNOutputPlanes()44 int Model::getNOutputPlanes() 45 { 46 return nOutputPlanes; 47 } 48 filter_CV(ComputeEnv * env,Buffer * packed_input_buf,Buffer * packed_output_buf,const W2Size & size)49 bool Model::filter_CV 50 ( 51 ComputeEnv *env, 52 Buffer *packed_input_buf, 53 Buffer *packed_output_buf, 54 const W2Size &size 55 ) 56 { 57 size_t in_size = sizeof(float) * size.width * size.height * nInputPlanes; 58 const float *packed_input = (float*)packed_input_buf->get_read_ptr_host(env, in_size); 59 float *packed_output = (float*)packed_output_buf->get_write_ptr_host(env); 60 61 //FutureNote: Should this not just be commented out? Or removed ? (ther's git history if we ever need it again) 62 #if 0 // HAVE_OPENCV 63 std::vector<cv::Mat> outputPlanes; 64 std::vector<cv::Mat> inputPlanes; 65 66 for (int i = 0; i < nInputPlanes; i++) 67 { 68 inputPlanes.push_back(cv::Mat::zeros(cvSize_from_w2(size), CV_32FC1)); 69 } 70 71 std::vector<W2Mat> inputPlanes_2(extract_viewlist_from_cvmat(inputPlanes)); 72 unpack_mat(inputPlanes_2, packed_input, size.width, size.height, nInputPlanes); 73 74 outputPlanes.clear(); 75 76 for (int i = 0; i < nOutputPlanes; i++) 77 { 78 outputPlanes.push_back(cv::Mat::zeros(cvSize_from_w2(size), CV_32FC1)); 79 } 80 81 // filter job issuing 82 std::vector<std::thread> workerThreads; 83 std::vector<W2Mat> inputPlanes_w2 = extract_viewlist_from_cvmat(inputPlanes); 84 std::vector<W2Mat> outputPlanes_w2 = extract_viewlist_from_cvmat(outputPlanes); 85 86 int worksPerThread = nOutputPlanes / nJob; 87 int nJob = modelUtility::getInstance().getNumberOfJobs(); 88 89 for (int idx = 0; idx < nJob; idx++) 90 { 91 if (!(idx == (nJob - 1) && worksPerThread * nJob != nOutputPlanes)) 92 { 93 workerThreads.push_back 94 ( 95 std::thread 96 ( 97 &Model::filterWorker, this, 98 std::ref(inputPlanes_w2), std::ref(weights), 99 std::ref(outputPlanes_w2), 100 static_cast<unsigned int>(worksPerThread * idx), 101 static_cast<unsigned int>(worksPerThread) 102 ) 103 ); 104 } 105 else 106 { 107 // worksPerThread * nJob != nOutputPlanes 108 workerThreads.push_back( 109 std::thread( 110 &Model::filterWorker, this, 111 std::ref(inputPlanes_w2), std::ref(weights), 112 std::ref(outputPlanes_w2), 113 static_cast<unsigned int>(worksPerThread * idx), 114 static_cast<unsigned int>(nOutputPlanes - worksPerThread * idx) 115 ) 116 ); 117 } 118 } 119 120 // wait for finishing jobs 121 for (auto& th : workerThreads) 122 { 123 th.join(); 124 } 125 126 std::vector<W2Mat> outputPlanes_2(extract_viewlist_from_cvmat(outputPlanes)); 127 pack_mat(packed_output, outputPlanes_2, size.width, size.height, nOutputPlanes); 128 129 return true; 130 #else 131 std::atomic<int> yi_shared(0); 132 133 auto thread_func = [&]() 134 { 135 int w = size.width; 136 int h = size.height; 137 138 while (true) 139 { 140 int yi = yi_shared++; 141 142 if (yi >= h) 143 { 144 break; 145 } 146 147 float *out_line = packed_output + w*nOutputPlanes * yi; 148 149 int yi0 = yi-1; 150 int yi1 = yi; 151 int yi2 = yi+1; 152 153 if (yi == 0) 154 { 155 yi0 = 0; 156 } 157 158 if (yi == h-1) 159 { 160 yi2 = yi1; 161 } 162 163 const float *in_line0 = packed_input + w * nInputPlanes * yi0; 164 const float *in_line1 = packed_input + w * nInputPlanes * yi1; 165 const float *in_line2 = packed_input + w * nInputPlanes * yi2; 166 167 for (int xi=0; xi<w; xi++) 168 { 169 int x0 = xi-1; 170 int x1 = xi; 171 int x2 = xi+1; 172 173 if (xi == 0) 174 { 175 x0 = 0; 176 } 177 178 if (xi == w-1) 179 { 180 x2 = x1; 181 } 182 183 const float *in00 = in_line0 + x0 * nInputPlanes; 184 const float *in01 = in_line0 + x1 * nInputPlanes; 185 const float *in02 = in_line0 + x2 * nInputPlanes; 186 187 const float *in10 = in_line1 + x0 * nInputPlanes; 188 const float *in11 = in_line1 + x1 * nInputPlanes; 189 const float *in12 = in_line1 + x2 * nInputPlanes; 190 191 const float *in20 = in_line2 + x0 * nInputPlanes; 192 const float *in21 = in_line2 + x1 * nInputPlanes; 193 const float *in22 = in_line2 + x2 * nInputPlanes; 194 195 for (int oi=0; oi<nOutputPlanes; oi++) 196 { 197 float sum = 0; 198 199 for (int ii=0; ii<nInputPlanes; ii++) 200 { 201 int wMatIndex = nInputPlanes * oi + ii; 202 const float *w = weights[wMatIndex].ptr<float>(0); 203 204 sum += in00[ii] * w[0]; 205 sum += in01[ii] * w[1]; 206 sum += in02[ii] * w[2]; 207 208 sum += in10[ii] * w[3]; 209 sum += in11[ii] * w[4]; 210 sum += in12[ii] * w[5]; 211 212 sum += in20[ii] * w[6]; 213 sum += in21[ii] * w[7]; 214 sum += in22[ii] * w[8]; 215 } 216 217 float v = sum; 218 v += (float) biases[oi]; 219 float mtz = (std::max)(v, 0.0f); 220 float ltz = (std::min)(v, 0.0f); 221 v = ltz*0.1f + mtz; 222 223 out_line[xi*nOutputPlanes + oi] = v; 224 } 225 } 226 } 227 }; 228 229 int w = size.width; 230 int h = size.height; 231 std::vector<std::thread> workerThreads; 232 int nJob = modelUtility::getInstance().getNumberOfJobs(); 233 234 for (int ji=0; ji<nJob; ji++) 235 { 236 workerThreads.emplace_back(std::thread(thread_func)); 237 } 238 239 for (auto&th : workerThreads) 240 { 241 th.join(); 242 } 243 #endif 244 return true; 245 } 246 247 //#define COMPARE_RESULT filter_AVX_OpenCL(W2XConv * conv,ComputeEnv * env,Buffer * packed_input_buf,Buffer * packed_output_buf,const W2Size & size)248 bool Model::filter_AVX_OpenCL 249 ( 250 W2XConv *conv, 251 ComputeEnv *env, 252 Buffer *packed_input_buf, 253 Buffer *packed_output_buf, 254 const W2Size &size 255 ) 256 { 257 int vec_width; 258 int weight_step; 259 int nJob = modelUtility::getInstance().getNumberOfJobs(); 260 const struct W2XConvProcessor *proc = conv->target_processor; 261 262 bool gpu = (proc->type == W2XCONV_PROC_OPENCL) || (proc->type == W2XCONV_PROC_CUDA); 263 264 if (gpu) 265 { 266 weight_step = GPU_VEC_WIDTH; 267 vec_width = GPU_VEC_WIDTH; 268 } 269 else 270 { 271 weight_step = nOutputPlanes; 272 vec_width = VEC_WIDTH; 273 } 274 275 float *weight_flat = (float*)w2xc_aligned_malloc(sizeof(float)*nInputPlanes*weight_step*3*3, 64); 276 float *fbiases_flat = (float*)w2xc_aligned_malloc(sizeof(float) * biases.size(), 64); 277 278 for (int i=0; i<(int)biases.size(); i++) 279 { 280 fbiases_flat[i] = (float) biases[i]; 281 } 282 283 if (nOutputPlanes == 1) 284 { 285 if (gpu) 286 { 287 for (int ii=0; ii<nInputPlanes; ii++) 288 { 289 W2Mat &wm = weights[ii]; 290 const float *src0 = wm.ptr<float>(0); 291 const float *src1 = wm.ptr<float>(1); 292 const float *src2 = wm.ptr<float>(2); 293 294 float *dst = weight_flat + ii * 9; 295 dst[0] = src0[0]; 296 dst[1] = src0[1]; 297 dst[2] = src0[2]; 298 299 dst[3] = src1[0]; 300 dst[4] = src1[1]; 301 dst[5] = src1[2]; 302 303 dst[6] = src2[0]; 304 dst[7] = src2[1]; 305 dst[8] = src2[2]; 306 307 } 308 } 309 else 310 { 311 for (int ii=0; ii<nInputPlanes; ii++) 312 { 313 W2Mat &wm = weights[ii]; 314 const float *src0 = wm.ptr<float>(0); 315 const float *src1 = wm.ptr<float>(1); 316 const float *src2 = wm.ptr<float>(2); 317 318 int ii_0 = ii % vec_width; 319 int ii_1 = (ii / vec_width) * vec_width; 320 321 float *dst = weight_flat + ii_1 * 9 + ii_0; 322 dst[0 * vec_width] = src0[0]; 323 dst[1 * vec_width] = src0[1]; 324 dst[2 * vec_width] = src0[2]; 325 326 dst[3 * vec_width] = src1[0]; 327 dst[4 * vec_width] = src1[1]; 328 dst[5 * vec_width] = src1[2]; 329 330 dst[6 * vec_width] = src2[0]; 331 dst[7 * vec_width] = src2[1]; 332 dst[8 * vec_width] = src2[2]; 333 } 334 } 335 } 336 else if (gpu && nInputPlanes == 1) 337 { 338 for (int oi=0; oi<nOutputPlanes; oi++) 339 { 340 W2Mat &wm = weights[oi]; 341 const float *src0 = wm.ptr<float>(0); 342 const float *src1 = wm.ptr<float>(1); 343 const float *src2 = wm.ptr<float>(2); 344 345 float *dst = weight_flat + oi * 9; 346 dst[0] = src0[0]; 347 dst[1] = src0[1]; 348 dst[2] = src0[2]; 349 350 dst[3] = src1[0]; 351 dst[4] = src1[1]; 352 dst[5] = src1[2]; 353 354 dst[6] = src2[0]; 355 dst[7] = src2[1]; 356 dst[8] = src2[2]; 357 } 358 } 359 else if (nOutputPlanes == 3) 360 { 361 /* | o0 | o1 | o2 ... | 362 * |i0 i1 i2 ... i127|i0 i1 i2 ... i127| ... |*/ 363 for (int oi=0; oi<nOutputPlanes; oi++) 364 { 365 for (int ii=0; ii<nInputPlanes; ii++) 366 { 367 int mi = oi*nInputPlanes+ii; 368 W2Mat &wm = weights[mi]; 369 const float *src0 = wm.ptr<float>(0); 370 const float *src1 = wm.ptr<float>(1); 371 const float *src2 = wm.ptr<float>(2); 372 373 float *dst = weight_flat + (oi * nInputPlanes * 9) + ii; 374 dst[0*nInputPlanes] = src0[0]; 375 dst[1*nInputPlanes] = src0[1]; 376 dst[2*nInputPlanes] = src0[2]; 377 378 dst[3*nInputPlanes] = src1[0]; 379 dst[4*nInputPlanes] = src1[1]; 380 dst[5*nInputPlanes] = src1[2]; 381 382 dst[6*nInputPlanes] = src2[0]; 383 dst[7*nInputPlanes] = src2[1]; 384 dst[8*nInputPlanes] = src2[2]; 385 } 386 } 387 } 388 else if (gpu && (nInputPlanes == 3) && (nOutputPlanes == 32)) 389 { 390 /* | i0 | i1 | i2 .. iN-1| 391 * |o0 o1 o2 o3..o31|o0 .... o32| .... | 392 * |<- ->| 393 * | 32 | 394 * | x 9 | 395 */ 396 397 for (int oi=0; oi<nOutputPlanes; oi++) 398 { 399 for (int ii=0; ii<nInputPlanes; ii++) 400 { 401 int mi = oi*nInputPlanes+ii; 402 W2Mat &wm = weights[mi]; 403 const float *src0 = wm.ptr<float>(0); 404 const float *src1 = wm.ptr<float>(1); 405 const float *src2 = wm.ptr<float>(2); 406 407 float *dst = weight_flat + (ii * nOutputPlanes * 9) + oi; 408 dst[0*nOutputPlanes] = src0[0]; 409 dst[1*nOutputPlanes] = src0[1]; 410 dst[2*nOutputPlanes] = src0[2]; 411 412 dst[3*nOutputPlanes] = src1[0]; 413 dst[4*nOutputPlanes] = src1[1]; 414 dst[5*nOutputPlanes] = src1[2]; 415 416 dst[6*nOutputPlanes] = src2[0]; 417 dst[7*nOutputPlanes] = src2[1]; 418 dst[8*nOutputPlanes] = src2[2]; 419 } 420 } 421 } 422 else 423 { 424 bool simd_oplane = false; 425 bool simd_iplane = false; 426 int simd_vec_width = 0; 427 428 if (proc->type == W2XCONV_PROC_HOST) 429 { 430 switch (proc->sub_type) 431 { 432 case W2XCONV_PROC_HOST_SSE3: 433 { 434 simd_vec_width = 4; 435 simd_oplane = true; 436 break; 437 } 438 case W2XCONV_PROC_HOST_NEON: 439 { 440 simd_vec_width = 4; 441 simd_oplane = true; 442 break; 443 } 444 case W2XCONV_PROC_HOST_ALTIVEC: 445 { 446 simd_vec_width = 8; 447 simd_oplane = true; 448 break; 449 } 450 case W2XCONV_PROC_HOST_AVX: 451 case W2XCONV_PROC_HOST_FMA: 452 { 453 simd_vec_width = 8; 454 simd_oplane = true; 455 break; 456 } 457 } 458 } 459 460 simd_oplane = simd_oplane && (nInputPlanes%(simd_vec_width*4) == 0) && (nOutputPlanes%(simd_vec_width*2) == 0); 461 simd_iplane = simd_iplane && (nInputPlanes%(simd_vec_width*4) == 0) && (nOutputPlanes%(simd_vec_width*2) == 0); 462 463 if (simd_oplane || simd_iplane) 464 { 465 /* 466 * weight_chunk (16x32x3x4 = 6144[Byte]) 467 * (where op_block_size=16, ip_block_size=32) 468 * 469 * 111 oplane x16 470 * 16 16 .. (x16) ..16 iplane x32 471 * \ | / horiz x3 472 * oplane xnOutputPlane_block 473 * iplane xnInputPlane_block 474 * vert x3 475 */ 476 int ip_block_size; 477 int op_block_size; 478 479 if (simd_oplane) 480 { 481 ip_block_size = (simd_vec_width*4); 482 op_block_size = (simd_vec_width*2); 483 } 484 else { 485 ip_block_size = (simd_vec_width*2); 486 op_block_size = (simd_vec_width*4); 487 } 488 489 int nInputPlane_block = nInputPlanes/ip_block_size; 490 int nOutputPlane_block = nOutputPlanes/op_block_size; 491 492 float *dst = weight_flat; 493 494 for (int dposy=0; dposy<3; dposy++) 495 { 496 for (int ii0=0; ii0<nInputPlane_block; ii0++) 497 { 498 for (int oi0=0; oi0<nOutputPlane_block; oi0++) 499 { 500 for (int dposx=0; dposx<3; dposx++) 501 { 502 if (simd_oplane) 503 { 504 for (int ii1=0; ii1<ip_block_size; ii1++) 505 { 506 for (int oi1=0; oi1<op_block_size; oi1++) 507 { 508 int ii = ii0*ip_block_size + ii1; 509 int oi = oi0*op_block_size + oi1; 510 int mi = oi*nInputPlanes + ii; 511 512 W2Mat &wm = weights[mi]; 513 float &src = wm.at<float>(dposy, dposx); 514 *dst = src; 515 516 dst++; 517 } 518 } 519 } 520 else 521 { 522 for (int oi1=0; oi1<op_block_size; oi1++) 523 { 524 for (int ii1=0; ii1<ip_block_size; ii1++) 525 { 526 int ii = ii0*ip_block_size + ii1; 527 int oi = oi0*op_block_size + oi1; 528 int mi = oi*nInputPlanes + ii; 529 530 W2Mat &wm = weights[mi]; 531 float &src = wm.at<float>(dposy, dposx); 532 *dst = src; 533 534 dst++; 535 } 536 } 537 } 538 } 539 } 540 } 541 } 542 } 543 else 544 { 545 /* | i0 | i1 | i2 .. iN-1| i0 | i1 | .. 546 * |o0 o1 o2 o3|o0 o1 o2 o3| .... |o4 o5 o6 o7|o4 o5 o6 o7| .. 547 * |<- ->| 548 * | VEC_WIDTH | 549 * | x 9 | 550 */ 551 for (int oi=0; oi<nOutputPlanes; oi++) 552 { 553 for (int ii=0; ii<nInputPlanes; ii++) 554 { 555 int mi = oi*nInputPlanes+ii; 556 W2Mat &wm = weights[mi]; 557 const float *src0 = wm.ptr<float>(0); 558 const float *src1 = wm.ptr<float>(1); 559 const float *src2 = wm.ptr<float>(2); 560 561 int oi_0 = oi % vec_width; 562 int oi_1 = (oi / vec_width) * vec_width; 563 564 float *dst = weight_flat + ((ii*weight_step + oi_1) * 9) + oi_0; 565 dst[0*vec_width] = src0[0]; 566 dst[1*vec_width] = src0[1]; 567 dst[2*vec_width] = src0[2]; 568 569 dst[3*vec_width] = src1[0]; 570 dst[4*vec_width] = src1[1]; 571 dst[5*vec_width] = src1[2]; 572 573 dst[6*vec_width] = src2[0]; 574 dst[7*vec_width] = src2[1]; 575 dst[8*vec_width] = src2[2]; 576 } 577 } 578 } 579 } 580 581 bool compare_result = false; 582 583 #ifdef COMPARE_RESULT 584 compare_result = true; 585 #endif 586 587 size_t in_size = size.width * size.height * sizeof(float) * nInputPlanes; 588 size_t out_size = size.width * size.height * sizeof(float) * nOutputPlanes; 589 590 if (compare_result) 591 { 592 Buffer *packed_output_cv_buf = new Buffer(env, sizeof(float) * size.width * size.height * nOutputPlanes); 593 594 double t0 = getsec(); 595 filter_CV(env, packed_input_buf, packed_output_cv_buf, size); 596 //filter_FMA_impl(packed_input, packed_output_cv, 597 // nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, size, nJob); 598 double t1 = getsec(); 599 600 /* 3x3 = 9 fma */ 601 double ops = size.width * size.height * 9.0 * 2.0 * nOutputPlanes * nInputPlanes; 602 603 if (proc->type == W2XCONV_PROC_OPENCL) 604 { 605 filter_OpenCL_impl 606 ( 607 env, 608 packed_input_buf, 609 packed_output_buf, 610 nInputPlanes, 611 nOutputPlanes, 612 fbiases_flat, 613 weight_flat, 614 size.width, 615 size.height, 616 nJob 617 ); 618 } 619 else if (proc->type == W2XCONV_PROC_CUDA) 620 { 621 filter_CUDA_impl 622 ( 623 env, 624 packed_input_buf, 625 packed_output_buf, 626 nInputPlanes, 627 nOutputPlanes, 628 fbiases_flat, 629 weight_flat, 630 size.width, 631 size.height, 632 nJob 633 ); 634 } 635 else 636 { 637 const float *packed_input = (float*)packed_input_buf->get_read_ptr_host(env, in_size); 638 float *packed_output = (float*)packed_output_buf->get_write_ptr_host(env); 639 640 switch (proc->sub_type) 641 { 642 #ifdef X86OPT 643 case W2XCONV_PROC_HOST_FMA: 644 { 645 filter_FMA_impl(env, packed_input, packed_output, 646 nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, 647 size.width, size.height, nJob); 648 break; 649 } 650 case W2XCONV_PROC_HOST_AVX: 651 { 652 filter_AVX_impl(env, packed_input, packed_output, 653 nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, 654 size.width, size.height, nJob); 655 break; 656 } 657 case W2XCONV_PROC_HOST_SSE3: 658 { 659 filter_SSE_impl(env, packed_input, packed_output, 660 nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, 661 size.width, size.height, nJob); 662 break; 663 } 664 #endif 665 #ifdef ARMOPT 666 case W2XCONV_PROC_HOST_NEON: 667 { 668 filter_NEON_impl(env, packed_input, packed_output, 669 nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, 670 size.width, size.height, nJob); 671 break; 672 } 673 #endif 674 #ifdef PPCOPT 675 case W2XCONV_PROC_HOST_ALTIVEC: 676 { 677 filter_AltiVec_impl(env, packed_input, packed_output, 678 nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, 679 size.width, size.height, nJob); 680 break; 681 } 682 #endif 683 default: 684 { 685 filter_CV(env, packed_input_buf, packed_output_buf, size); 686 break; 687 } 688 } 689 } 690 691 double t2 = getsec(); 692 693 printf("(w=%d,h=%d) (ip=%d,op=%d) %f %f %f[gflops]\n", size.width, size.height, nInputPlanes, nOutputPlanes, t1-t0, t2-t1, ops/(1000*1000*1000)); 694 printf("ver2 : %f [Gflops]\n", (ops/(1000.0*1000.0*1000.0)) / (t2-t1)); 695 printf("orig : %f [Gflops]\n", (ops/(1000.0*1000.0*1000.0)) / (t1-t0)); 696 int error_count = 0; 697 698 float *packed_output_cv = (float*)packed_output_cv_buf->get_read_ptr_host(env, out_size); 699 float *packed_output = (float*)packed_output_buf->get_read_ptr_host(env, out_size); 700 701 for (int i=0; i<size.width * size.height * nOutputPlanes; i++) 702 { 703 float v0 = packed_output_cv[i]; 704 float v1 = packed_output[i]; 705 float d = fabs(v0 - v1); 706 707 float r0 = d/fabs(v0); 708 float r1 = d/fabs(v1); 709 710 float r = (std::max)(r0, r1); 711 712 if (r > 0.1f && d > 0.000001f) 713 { 714 int plane = i % nOutputPlanes; 715 int pixpos = i / nOutputPlanes; 716 int xpos = pixpos % size.width; 717 int ypos = pixpos / size.width; 718 719 printf("d=%.20f %.20f %.20f @ (%d,%d,%d,%d) \n",r, v0, v1, xpos, ypos, plane, i); 720 error_count++; 721 722 if (error_count >= 256) 723 { 724 exit(1); 725 } 726 } 727 } 728 729 if (error_count != 0) 730 { 731 exit(1); 732 } 733 734 delete packed_output_cv_buf; 735 } 736 else 737 { 738 if (proc->type == W2XCONV_PROC_OPENCL) 739 { 740 filter_OpenCL_impl 741 ( 742 env, 743 packed_input_buf, 744 packed_output_buf, 745 nInputPlanes, 746 nOutputPlanes, 747 fbiases_flat, 748 weight_flat, 749 size.width, 750 size.height, 751 nJob 752 ); 753 } 754 else if (proc->type == W2XCONV_PROC_CUDA) 755 { 756 filter_CUDA_impl 757 ( 758 env, 759 packed_input_buf, 760 packed_output_buf, 761 nInputPlanes, 762 nOutputPlanes, 763 fbiases_flat, 764 weight_flat, 765 size.width, 766 size.height, 767 nJob 768 ); 769 } 770 else 771 { 772 const float *packed_input = (float*)packed_input_buf->get_read_ptr_host(env, in_size); 773 float *packed_output = (float*)packed_output_buf->get_write_ptr_host(env); 774 775 switch (proc->sub_type) 776 { 777 #ifdef X86OPT 778 case W2XCONV_PROC_HOST_FMA: 779 { 780 filter_FMA_impl 781 ( 782 env, 783 packed_input, 784 packed_output, 785 nInputPlanes, 786 nOutputPlanes, 787 fbiases_flat, 788 weight_flat, 789 size.width, 790 size.height, 791 nJob 792 ); 793 break; 794 } 795 case W2XCONV_PROC_HOST_AVX: 796 { 797 filter_AVX_impl 798 ( 799 env, 800 packed_input, 801 packed_output, 802 nInputPlanes, 803 nOutputPlanes, 804 fbiases_flat, 805 weight_flat, 806 size.width, 807 size.height, 808 nJob 809 ); 810 break; 811 } 812 case W2XCONV_PROC_HOST_SSE3: 813 { 814 filter_SSE_impl 815 ( 816 env, 817 packed_input, 818 packed_output, 819 nInputPlanes, 820 nOutputPlanes, 821 fbiases_flat, 822 weight_flat, 823 size.width, 824 size.height, 825 nJob 826 ); 827 break; 828 } 829 #endif 830 #ifdef ARMOPT 831 case W2XCONV_PROC_HOST_NEON: 832 { 833 filter_NEON_impl 834 ( 835 env, 836 packed_input, 837 packed_output, 838 nInputPlanes, 839 nOutputPlanes, 840 fbiases_flat, 841 weight_flat, 842 size.width, 843 size.height, 844 nJob 845 ); 846 break; 847 } 848 #endif 849 #ifdef PPCOPT 850 case W2XCONV_PROC_HOST_ALTIVEC: 851 { 852 filter_AltiVec_impl 853 ( 854 env, 855 packed_input, 856 packed_output, 857 nInputPlanes, 858 nOutputPlanes, 859 fbiases_flat, 860 weight_flat, 861 size.width, 862 size.height, 863 nJob 864 ); 865 break; 866 } 867 #endif 868 default: 869 { 870 filter_CV(env, packed_input_buf, packed_output_buf, size); 871 break; 872 } 873 } 874 } 875 } 876 877 w2xc_aligned_free(fbiases_flat); 878 w2xc_aligned_free(weight_flat); 879 880 return true; 881 } 882 filter(W2XConv * conv,ComputeEnv * env,Buffer * packed_input_buf,Buffer * packed_output_buf,W2Size const & size)883 bool Model::filter (W2XConv *conv, ComputeEnv *env, Buffer *packed_input_buf, Buffer *packed_output_buf, W2Size const &size) 884 { 885 bool ret; 886 887 bool avx_available = true; 888 bool cl_available = true; 889 bool cuda_available = true; 890 891 if (nOutputPlanes > GPU_VEC_WIDTH) 892 { 893 cl_available = false; 894 cuda_available = false; 895 } 896 897 if (nOutputPlanes == 32 && nInputPlanes == 1) 898 { 899 /* i1 o32 filter */ 900 } 901 else if (nOutputPlanes == 1 && nInputPlanes == 128) 902 { 903 /* i128 o32 filter */ 904 } 905 else if (nOutputPlanes == 32 && nInputPlanes == 3) 906 { 907 /* i3 o32 filter */ 908 } 909 else if (nOutputPlanes == 3 && nInputPlanes == 128) 910 { 911 /* i128 o3 filter */ 912 } 913 else 914 { 915 if (nInputPlanes & 1) 916 { 917 cl_available = false; 918 cuda_available = false; 919 avx_available = false; 920 } 921 922 if (nOutputPlanes & 31) 923 { 924 cl_available = false; 925 cuda_available = false; 926 avx_available = false; 927 } 928 929 if (nInputPlanes == 32 || nInputPlanes == 64 || nInputPlanes == 128) 930 { 931 /* ok */ 932 } 933 else 934 { 935 cuda_available = false; 936 } 937 } 938 939 //printf("%d %d %d\n", 940 // (int)cuda_available, 941 // (int)cl_available, 942 // (int)avx_available); 943 944 const struct W2XConvProcessor *proc = conv->target_processor; 945 946 if ((cl_available && proc->type == W2XCONV_PROC_OPENCL) || 947 (cuda_available && proc->type == W2XCONV_PROC_CUDA) || 948 (avx_available && proc->type == W2XCONV_PROC_HOST)) 949 { 950 ret = filter_AVX_OpenCL(conv, env, packed_input_buf, packed_output_buf, size); 951 } 952 else 953 { 954 ret = filter_CV(env, packed_input_buf, packed_output_buf, size); 955 } 956 957 return ret; 958 } 959 loadModelFromJSONObject(picojson::object & jsonObj)960 bool Model::loadModelFromJSONObject(picojson::object &jsonObj) { 961 962 // nInputPlanes,nOutputPlanes,kernelSize have already set. 963 int matProgress = 0; 964 picojson::array &wOutputPlane = jsonObj["weight"].get<picojson::array>(); 965 966 // setting weight matrices 967 for (auto&& wInputPlaneV : wOutputPlane) 968 { 969 picojson::array &wInputPlane = wInputPlaneV.get<picojson::array>(); 970 971 for (auto&& weightMatV : wInputPlane) 972 { 973 picojson::array &weightMat = weightMatV.get<picojson::array>(); 974 W2Mat writeMatrix(kernelSize, kernelSize, CV_32FC1); 975 976 for (int writingRow = 0; writingRow < kernelSize; writingRow++) 977 { 978 auto& weightMatRowV = weightMat.at(writingRow); 979 picojson::array &weightMatRow = weightMatRowV.get< 980 picojson::array>(); 981 982 for (int index = 0; index < kernelSize; index++) 983 { 984 writeMatrix.ptr<float>(writingRow)[index] = (float) weightMatRow[index].get<double>(); 985 } // for(weightMatRow) (writing 1 row finished) 986 987 } // for(weightMat) (writing 1 matrix finished) 988 989 weights.push_back(std::move(writeMatrix)); 990 matProgress++; 991 } // for(wInputPlane) (writing matrices in set of wInputPlane finished) 992 993 } //for(wOutputPlane) (writing all matrices finished) 994 995 // setting biases 996 picojson::array biasesData = jsonObj["bias"].get<picojson::array>(); 997 998 for (int index = 0; index < nOutputPlanes; index++) 999 { 1000 biases[index] = biasesData[index].get<double>(); 1001 } 1002 1003 return true; 1004 } 1005 1006 #ifdef HAVE_OPENCV filterWorker(std::vector<W2Mat> & inputPlanes_w2,std::vector<W2Mat> & weightMatrices_w2,std::vector<W2Mat> & outputPlanes_w2,unsigned int beginningIndex,unsigned int nWorks)1007 bool Model::filterWorker 1008 ( 1009 std::vector<W2Mat> &inputPlanes_w2, 1010 std::vector<W2Mat> &weightMatrices_w2, 1011 std::vector<W2Mat> &outputPlanes_w2, 1012 unsigned int beginningIndex, 1013 unsigned int nWorks 1014 ) 1015 { 1016 std::vector<cv::Mat> inputPlanes; 1017 std::vector<cv::Mat> weightMatrices; 1018 std::vector<cv::Mat> outputPlanes; 1019 1020 extract_viewlist_to_cvmat(inputPlanes, inputPlanes_w2); 1021 extract_viewlist_to_cvmat(weightMatrices, weightMatrices_w2); 1022 extract_viewlist_to_cvmat(outputPlanes, outputPlanes_w2); 1023 1024 cv::Size ipSize = inputPlanes[0].size(); 1025 // filter processing 1026 // input : inputPlanes 1027 // kernel : weightMatrices 1028 1029 for (int opIndex = beginningIndex; opIndex < (int)(beginningIndex + nWorks); opIndex++) 1030 { 1031 int wMatIndex = nInputPlanes * opIndex; 1032 cv::Mat outputPlane = cv::Mat::zeros(ipSize, CV_32FC1); 1033 cv::Mat &uIntermediatePlane = outputPlane; // all zero matrix 1034 1035 for (int ipIndex = 0; ipIndex < nInputPlanes; ipIndex++) 1036 { 1037 cv::Mat &uInputPlane = inputPlanes[ipIndex]; 1038 cv::Mat &weightMatrix = weightMatrices[wMatIndex + ipIndex]; 1039 cv::Mat filterOutput = cv::Mat::zeros(ipSize, CV_32FC1); 1040 1041 cv::filter2D(uInputPlane, filterOutput, -1, weightMatrix, cv::Point(-1, -1), 0.0, cv::BORDER_REPLICATE); 1042 1043 cv::add(uIntermediatePlane, filterOutput, uIntermediatePlane); 1044 } 1045 1046 cv::add(uIntermediatePlane, biases[opIndex], uIntermediatePlane); 1047 cv::Mat moreThanZero = cv::Mat(ipSize,CV_32FC1,0.0); 1048 cv::Mat lessThanZero = cv::Mat(ipSize,CV_32FC1,0.0); 1049 (cv::max)(uIntermediatePlane, 0.0, moreThanZero); 1050 (cv::min)(uIntermediatePlane, 0.0, lessThanZero); 1051 cv::scaleAdd(lessThanZero, 0.1, moreThanZero, uIntermediatePlane); 1052 uIntermediatePlane.copyTo(outputPlanes[opIndex]); 1053 1054 } // for index 1055 1056 return true; 1057 } 1058 #endif 1059 1060 modelUtility * modelUtility::instance = nullptr; 1061 getInstance()1062 modelUtility& modelUtility::getInstance() 1063 { 1064 if(instance == nullptr) 1065 { 1066 instance = new modelUtility(); 1067 } 1068 return *instance; 1069 } 1070 Model(FILE * binfp)1071 Model::Model(FILE *binfp) 1072 { 1073 uint32_t nInputPlanes, nOutputPlanes; 1074 1075 fread(&nInputPlanes, 4, 1, binfp); 1076 fread(&nOutputPlanes, 4, 1, binfp); 1077 1078 this->nInputPlanes = nInputPlanes; 1079 this->nOutputPlanes = nOutputPlanes; 1080 this->kernelSize = 3; 1081 this->weights.clear(); 1082 this->biases.clear(); 1083 1084 // setting weight matrices 1085 for (uint32_t oi=0; oi<nOutputPlanes; oi++) 1086 { 1087 for (uint32_t ii=0; ii<nInputPlanes; ii++) 1088 { 1089 W2Mat writeMatrix(kernelSize, kernelSize, CV_32FC1); 1090 1091 for (int yi=0; yi<3; yi++) 1092 { 1093 for (int xi=0; xi<3; xi++) 1094 { 1095 double v; 1096 fread(&v, 8, 1, binfp); 1097 writeMatrix.at<float>(yi, xi) = (float) v; 1098 } 1099 } 1100 1101 this->weights.emplace_back(std::move(writeMatrix)); 1102 } 1103 } 1104 1105 for (uint32_t oi = 0; oi < nOutputPlanes; oi++) 1106 { 1107 double v; 1108 fread(&v, 8, 1, binfp); 1109 biases.push_back(v); 1110 } 1111 } 1112 Model(int nInputPlane,int nOutputPlane,const float * coef_list,const float * bias)1113 Model::Model(int nInputPlane, int nOutputPlane, const float *coef_list, const float *bias) 1114 { 1115 this->nInputPlanes = nInputPlane; 1116 this->nOutputPlanes = nOutputPlane; 1117 this->kernelSize = 3; 1118 this->weights.clear(); 1119 this->biases.clear(); 1120 1121 int cur = 0; 1122 1123 // setting weight matrices 1124 for (uint32_t oi = 0; oi < (uint32_t)nOutputPlanes; oi++) 1125 { 1126 for (uint32_t ii = 0; ii < (uint32_t)nInputPlanes; ii++) 1127 { 1128 W2Mat writeMatrix(kernelSize, kernelSize, CV_32FC1); 1129 1130 for (int yi = 0; yi < 3; yi++) 1131 { 1132 for (int xi = 0; xi < 3; xi++) 1133 { 1134 double v = coef_list[cur++]; 1135 writeMatrix.at<float>(yi, xi) = (float) v; 1136 } 1137 } 1138 1139 this->weights.emplace_back(std::move(writeMatrix)); 1140 } 1141 } 1142 1143 for (uint32_t oi = 0; oi < (uint32_t)nOutputPlanes; oi++) 1144 { 1145 double v = bias[oi]; 1146 biases.push_back(v); 1147 } 1148 } generateModelFromJSON(const _tstring & fileName,std::vector<std::unique_ptr<Model>> & models)1149 bool modelUtility::generateModelFromJSON 1150 ( 1151 const _tstring &fileName, 1152 std::vector<std::unique_ptr<Model> > &models 1153 ) 1154 { 1155 _tstring binpath = fileName + _T(".bin"); 1156 FILE *binfp = _tfopen(binpath.c_str(), _T("rb")); 1157 1158 if (binfp) 1159 { 1160 bool need_update = update_test(binpath.c_str(), fileName.c_str()); 1161 1162 if (need_update) 1163 { 1164 fclose(binfp); 1165 binfp = NULL; 1166 } 1167 } 1168 1169 if (binfp) 1170 { 1171 uint32_t nModel; 1172 1173 fread(&nModel, 4, 1, binfp); 1174 1175 for (uint32_t i=0; i<nModel; i++) 1176 { 1177 std::unique_ptr<Model> m = std::unique_ptr<Model>( 1178 new Model(binfp)); 1179 models.push_back(std::move(m)); 1180 } 1181 1182 fclose(binfp); 1183 } 1184 else 1185 { 1186 std::ifstream jsonFile; 1187 1188 jsonFile.open(fileName); 1189 if (!jsonFile.is_open()) 1190 { 1191 std::string fname = _tstr2str(fileName); 1192 std::cerr << "Error : couldn't open " << fname << std::endl; 1193 return false; 1194 } 1195 1196 picojson::value jsonValue; 1197 jsonFile >> jsonValue; 1198 1199 std::string errMsg = picojson::get_last_error(); 1200 1201 if (!errMsg.empty()) 1202 { 1203 std::cerr << "Error : PicoJSON Error : " << errMsg << std::endl; 1204 return false; 1205 } 1206 1207 picojson::array& objectArray = jsonValue.get<picojson::array>(); 1208 1209 for (auto&& obj : objectArray) 1210 { 1211 std::unique_ptr<Model> m = std::unique_ptr<Model>( 1212 new Model(obj.get<picojson::object>())); 1213 models.push_back(std::move(m)); 1214 } 1215 1216 binfp = _tfopen(binpath.c_str(), _T("wb")); 1217 if (binfp) 1218 { 1219 size_t nModel = objectArray.size(); 1220 fwrite(&nModel, 4, 1, binfp); 1221 1222 for (auto&& m : models) 1223 { 1224 uint32_t nInputPlanes = m->getNInputPlanes(); 1225 uint32_t nOutputPlanes = m->getNOutputPlanes(); 1226 1227 fwrite(&nInputPlanes, 4, 1, binfp); 1228 fwrite(&nOutputPlanes, 4, 1, binfp); 1229 1230 std::vector<W2Mat> &weights = m->getWeigts(); 1231 1232 int nw = (int) weights.size(); 1233 1234 for (int wi = 0; wi < nw; wi++) 1235 { 1236 W2Mat &wm = weights[wi]; 1237 double v; 1238 v = wm.at<float>(0,0); 1239 fwrite(&v, 1, 8, binfp); 1240 v = wm.at<float>(0,1); 1241 fwrite(&v, 1, 8, binfp); 1242 v = wm.at<float>(0,2); 1243 fwrite(&v, 1, 8, binfp); 1244 1245 v = wm.at<float>(1,0); 1246 fwrite(&v, 1, 8, binfp); 1247 v = wm.at<float>(1,1); 1248 fwrite(&v, 1, 8, binfp); 1249 v = wm.at<float>(1,2); 1250 fwrite(&v, 1, 8, binfp); 1251 1252 v = wm.at<float>(2,0); 1253 fwrite(&v, 1, 8, binfp); 1254 v = wm.at<float>(2,1); 1255 fwrite(&v, 1, 8, binfp); 1256 v = wm.at<float>(2,2); 1257 fwrite(&v, 1, 8, binfp); 1258 } 1259 1260 std::vector<double> &b = m->getBiases(); 1261 fwrite(&b[0], 8, b.size(), binfp); 1262 } 1263 1264 fclose(binfp); 1265 } 1266 } 1267 return true; 1268 } 1269 generateModelFromMEM(int layer_depth,int num_input_plane,const int * num_map,const float * coef_list,const float * bias,std::vector<std::unique_ptr<Model>> & models)1270 void modelUtility::generateModelFromMEM 1271 ( 1272 int layer_depth, 1273 int num_input_plane, 1274 const int *num_map, // num_map[layer_depth] 1275 const float *coef_list, // coef_list[layer_depth][num_map][3x3] 1276 const float *bias, // bias[layer_depth][num_map] 1277 std::vector<std::unique_ptr<Model> > &models 1278 ) 1279 { 1280 int cur = 0; 1281 models.resize(layer_depth); 1282 1283 models[0] = std::unique_ptr<Model>(new Model(num_input_plane, num_map[0], &coef_list[0], &bias[0])); 1284 1285 cur += num_map[0]; 1286 1287 for (int li = 1; li < layer_depth; li++) 1288 { 1289 models[li] = std::unique_ptr<Model>(new Model(num_map[li - 1], num_map[li], &coef_list[cur * 3 * 3], &bias[cur])); 1290 cur += num_map[li]; 1291 } 1292 } 1293 setNumberOfJobs(int setNJob)1294 bool modelUtility::setNumberOfJobs(int setNJob) 1295 { 1296 if(setNJob < 1) 1297 { 1298 return false; 1299 } 1300 1301 nJob = setNJob; 1302 1303 return true; 1304 }; 1305 getNumberOfJobs()1306 int modelUtility::getNumberOfJobs() 1307 { 1308 return nJob; 1309 } 1310 1311 // for debugging printWeightMatrix()1312 void Model::printWeightMatrix() 1313 { 1314 1315 for (auto&& weightMatrix : weights) 1316 { 1317 //std::cout << weightMatrix << std::endl; 1318 } 1319 1320 } 1321 printBiases()1322 void Model::printBiases() { 1323 1324 for (auto&& bias : biases) 1325 { 1326 std::cout << bias << std::endl; 1327 } 1328 } 1329 } 1330