/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define KERNEL_ARG_DTYPE float
#define TYPE_FLOAT 1
#define TYPE_HALF 2

#if defined(FUSED_CONV_RELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope)))
#define FUSED_ARG KERNEL_ARG_DTYPE negative_slope,
#elif defined(FUSED_CONV_PRELU)
#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope[c])))
#define FUSED_ARG __global const KERNEL_ARG_DTYPE* negative_slope,
#elif defined(FUSED_CONV_POWER)
#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, (Dtype)power)
#define FUSED_ARG KERNEL_ARG_DTYPE power,
#elif defined(FUSED_CONV_TANH)
#define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)
#define FUSED_ARG
#elif defined(FUSED_CONV_RELU6)
#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), (Dtype)min_value, (Dtype)max_value))
#define FUSED_ARG KERNEL_ARG_DTYPE min_value, KERNEL_ARG_DTYPE max_value,
#else
#define ACTIVATION_RELU_FUNCTION(x, c) (x)
#define FUSED_ARG
#endif

#ifdef FUSED_CONV_ELTWISE
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
    const Dtype _x_ = eltwise_data[(_offset_)] + (_data_); \
    (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
} while(0)
#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
#else
#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
    const Dtype _x_ = (_data_); \
    (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
} while(0)
#define ELTWISE_DATA_ARG
#endif

#if APPLY_BIAS
#define BIAS_KERNEL_ARG __global Dtype * biases_base,
#else
#define BIAS_KERNEL_ARG
#endif
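
// Note on the fusion macros above (illustrative expansion only): with FUSED_CONV_RELU and
// FUSED_CONV_ELTWISE both defined, ACTIVATION_FUNCTION(dst, off, val, ch) expands to roughly
//     const Dtype _x_ = eltwise_data[off] + (val);
//     dst[off] = (_x_ > 0) ? _x_ : _x_ * negative_slope;
// The extra operands (negative_slope, eltwise_data, biases_base, ...) reach the kernels
// through FUSED_ARG / ELTWISE_DATA_ARG / BIAS_KERNEL_ARG, which splice the corresponding
// parameters into each kernel's signature.
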
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
#define LOOP0(VAR, STMT)
#define LOOP1(VAR, STMT) (STMT); (VAR)++;
#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++;
#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++;
#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++;
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
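
// For reference, LOOP(N, VAR, STMT) pastes LOOP##N via CAT and so repeats STMT exactly N
// times at preprocessing time (a manual unroll); N must be a literal in 0..16.  Sketch of
// the expansion for N = 3:
//     int i = 0;
//     LOOP(3, i, body);   // -> (body); (i)++; (body); (i)++; (body); (i)++;
// so STMT sees the running index and VAR equals N afterwards.
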
#if defined(convolve_simd) || defined(Conv_Interleaved)
#if TYPE == TYPE_HALF
#define INT_TYPE ushort
#define INT_TYPE2 ushort2
#define INT_TYPE4 ushort4
#define INT_TYPE8 ushort8
#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read_us2
#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read_us4
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read_us8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read_us
#else
#define INT_TYPE uint
#define INT_TYPE2 uint2
#define INT_TYPE4 uint4
#define INT_TYPE8 uint8
#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read2
#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read4
#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8
#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read
#endif
#endif

#ifdef KERNEL_BASIC

__kernel void ConvolveBasic(
    ELTWISE_DATA_ARG
    FUSED_ARG
    __global Dtype* image_data,
    int image_offset,
    __global Dtype* kernel_data,
    int kernel_offset,
    __global Dtype* bias,
    const int bias_offset,
    __global Dtype* convolved_image_base,
    const int convolved_image_base_offset,
    const int convolved_image_offset,
    const ushort input_width,
    const ushort input_height,
    const ushort output_width,
    const ushort output_height,
    const ushort pad_w,
    const ushort pad_h
)
{
    __global Dtype* convolved_image = convolved_image_base + convolved_image_base_offset;
    const int outputX = get_global_id(0);
    const int outputY = get_global_id(1);
    const int kernelNum = get_global_id(2) * ZPAR;
    if (outputX < output_width && outputY < output_height)
    {
        Dtype sum[ZPAR];
        for (int kern = 0; kern < ZPAR; kern++)
        {
            sum[kern] = 0.0f;
        }
        const int org_y = outputY * STRIDE_Y - pad_h;
        const int org_x = outputX * STRIDE_X - pad_w;
        const int currentKernelOffset = kernel_offset + kernelNum*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
#if APPLY_BIAS
        const int biasIndex = bias_offset + kernelNum;
#endif
        const int local_image_offset = org_y * input_width + org_x;
        const int imageSize = input_width * input_height;
        __global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset));
        __global Dtype* kernel_dataPtr = (kernel_data + (currentKernelOffset));
        for (int c = 0; c < CHANNELS; c++)
        {
            for (int y = 0; y < KERNEL_HEIGHT; y++)
            {
                for (int x = 0; x < KERNEL_WIDTH; x++)
                {
                    int y_ = org_y + y * DILATION_Y;
                    int x_ = org_x + x * DILATION_X;
                    if (!(y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width))
                    {
                        continue;
                    }
                    for (int kern = 0; kern < ZPAR; kern++)
                    {
                        sum[kern] += image_dataPtr[x * DILATION_X] * kernel_dataPtr[kern*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS + x];
                    }
                }
                image_dataPtr += input_width * DILATION_Y;
                kernel_dataPtr += KERNEL_WIDTH;
            }
            image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y;
        }

        for (int kern = 0; kern < ZPAR; kern++)
        {
            if (kernelNum + kern < OUTPUT_Z)
            {
                int offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX;
#if APPLY_BIAS
                ACTIVATION_FUNCTION(convolved_image, offset, sum[kern] + bias[biasIndex + kern], biasIndex + kern);
#else
                ACTIVATION_FUNCTION(convolved_image, offset, sum[kern], kernelNum + kern);
#endif
            }
        }
    }
}

#elif defined KERNEL_IDLF

// Each work-item computes an OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map.
// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image.
// NDRange: (output_width+pad) / OUT_BLOCK_WIDTH, (output_height+pad) / OUT_BLOCK_HEIGHT, NUM_FILTERS / OUT_BLOCK_DEPTH

// NOTE: for beignet, this reqd_work_group_size does not guarantee that SIMD16 mode will be used;
// the compiler could choose to use two SIMD8 threads, and if that happens the code will break.
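
// Illustrative sizing (not normative): each lane stages one input column into in_buf[],
// so INVEC_SIZE (supplied by the host) has to cover at least the input rows one output
// block touches, i.e. about (OUT_BLOCK_HEIGHT - 1) * STRIDE_Y + (KERNEL_HEIGHT - 1) * DILATION_Y + 1;
// e.g. OUT_BLOCK_HEIGHT = 3, STRIDE_Y = 1, KERNEL_HEIGHT = 3, DILATION_Y = 1 needs 5 rows.
// Columns are exchanged between the SIMD_SIZE lanes via BLOCK_IN()/intel_sub_group_shuffle.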
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
__kernel void
convolve_simd(
    ELTWISE_DATA_ARG
    FUSED_ARG
    __global Dtype* inputs,
    __global Dtype* weights,
    BIAS_KERNEL_ARG
    __global Dtype* outputs_base,
    const int outputs_offset,
    const ushort input_width,
    const ushort input_height,
    const ushort output_width,
    const ushort output_height)
{
    __global Dtype* outputs = outputs_base + outputs_offset;
    unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH;  // oc = Output Column
    unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
    unsigned int fm = get_global_id(2);                    // fm = Feature Map = od = Output Depth
    unsigned int fmg = get_group_id(2);
    unsigned int lid = get_local_id(2);

    Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0.0f };

    // find weights address of given neuron (lid is index)
    unsigned int weight_addr = (fmg % FILTERS_IN_GROUP) *
                               INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;

    unsigned int num_in_batch = fm / ALIGNED_NUM_FILTERS;

    unsigned int input_batch_offset = num_in_batch * INPUT_PITCH * TOTAL_INPUT_DEPTH_SIZE;

    int curr_y = or * STRIDE_Y;
    int curr_x = oc * STRIDE_X + lid;

    int in_addr = input_batch_offset
                  + (curr_y - INPUT_PAD_H) * INPUT_WIDTH  // y tile offset
                  + curr_x - INPUT_PAD_W;                 // x tile offset

    const int in_limit = (get_global_size(2) / ALIGNED_NUM_FILTERS) * TOTAL_INPUT_DEPTH_SIZE * INPUT_PITCH - 1;

    Dtype in_buf[INVEC_SIZE];

    for(int kd = 0; kd < INPUT_DEPTH; kd++)
    {
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
        const bool cx_out_of_range = !(curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W);
        int in_offset = in_addr;
        __attribute__((opencl_unroll_hint(INVEC_SIZE)))
        for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)
        {
            Dtype input = inputs[clamp(in_offset, 0, in_limit)];
            int cy = curr_y + reg;
            in_buf[reg] = (cx_out_of_range || cy < INPUT_PAD_H || cy >= INPUT_HEIGHT + INPUT_PAD_H) ? 0 : input;
        }
#else
        int in_offset = in_addr;
        __attribute__((opencl_unroll_hint(INVEC_SIZE)))
        for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)
        {
            in_buf[reg] = inputs[min(in_offset, in_limit)];
        }
#endif

        in_addr += INPUT_PITCH;

#define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c))
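
        // BLOCK_IN(n, c) returns row n of the staged tile as seen by lane c: each lane
        // loaded a different input column into its private in_buf[], and the sub-group
        // shuffle broadcasts those registers, so the unrolled loop below can address
        // column (bc * STRIDE_X + kc * DILATION_X) without touching memory again.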
        int kr = 0; // kr = Kernel Row
        LOOP(KERNEL_HEIGHT, kr, // LOOP is a macro that unrolls the loop.
        {
            int kc = 0; // kc = Kernel Column
            LOOP(KERNEL_WIDTH, kc,
            {
                Dtype weight_value = weights[weight_addr];
                weight_addr += SIMD_SIZE;
                for (int br=0; br < OUT_BLOCK_HEIGHT; br++)
                {
                    for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++)
                    {
                        Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X);
                        out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_value, input, out[br * OUT_BLOCK_WIDTH + bc]);
                    }
                }
            });
        });
    }

    fm = fm % ALIGNED_NUM_FILTERS;

#if LEFT_FILTERS > 0
    if (fm < NUM_FILTERS)
#endif
    {
        unsigned int out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH;
        out_addr += or * output_width + oc;
        // we need this address calculation for biases because we support views and batching
#if APPLY_BIAS
        Dtype bias = biases_base[fm];
#else
        Dtype bias = 0;
#endif

        for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++)
        {
            if (r + or >= output_height) break;
            for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++)
            {
                if (c + oc >= output_width) break;
                // this does a scattered write to SIMD_SIZE different feature maps,
                // so that data within one map is contiguous, thus ready for input to next layer.
                ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm);
            }
        }
    }
}

#elif defined KERNEL_GEMM_LIKE

#if APPLY_BIAS
#define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i)
#else
#define SUBGROUP_GET_BIAS(k, i) ((Dtype)0)
#endif

#ifdef Conv_Interleaved
typedef struct float1 { float s0; } float1;
typedef struct float5 { float s0; float s1; float s2; float s3; float s4; } float5;
typedef struct float6 { float s0; float s1; float s2; float s3; float s4; float s5; } float6;
typedef struct float7 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; } float7;
typedef struct float9 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; } float9;
typedef struct float10 { float s0; float s1; float s2; float s3; float s4; float s5;
                         float s6; float s7; float s8; float s9; } float10;
typedef struct float11 { float s0; float s1; float s2; float s3; float s4; float s5;
                         float s6; float s7; float s8; float s9; float sa; } float11;
typedef struct float12 { float s0; float s1; float s2; float s3; float s4; float s5;
                         float s6; float s7; float s8; float s9; float sa; float sb; } float12;
typedef struct float13 { float s0; float s1; float s2; float s3; float s4; float s5;
                         float s6; float s7; float s8; float s9; float sa; float sb; float sc; } float13;
typedef struct float14 { float s0; float s1; float s2; float s3; float s4; float s5;
                         float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; } float14;
typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float s5;
                         float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;
typedef struct float0 { float s0; } float0; //never used but makes compiler happy.

typedef struct half1 { half s0; } half1;
typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5;
typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6;
typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7;
typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9;
typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5;
                        half s6; half s7; half s8; half s9; } half10;
typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5;
                        half s6; half s7; half s8; half s9; half sa; } half11;
typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5;
                        half s6; half s7; half s8; half s9; half sa; half sb; } half12;
typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5;
                        half s6; half s7; half s8; half s9; half sa; half sb; half sc; } half13;
typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5;
                        half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; } half14;
typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5;
                        half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; half se; } half15;
typedef struct half0 { half s0; } half0; //never used but makes compiler happy.

#define OUT_PITCH_X output_width
#define ROW_PITCH input_width

#define GEMM_LIKE_KERNEL_ARGS     \
    ELTWISE_DATA_ARG              \
    FUSED_ARG                     \
    const __global Dtype *src0,   \
    const __global Dtype *src1,   \
    BIAS_KERNEL_ARG               \
    __global Dtype *dst_base,     \
    const int dst_offset,         \
    const ushort input_width,     \
    const ushort input_height,    \
    const ushort output_width,    \
    const ushort output_height,   \
    const int out_pitch_y,        \
    const int out_pitch_z,        \
    const int aligned_input_size, \
    const int slice_pitch
#endif

#ifdef GEMM_LIKE_CONV_32_1
//////////////////////////////////////////////////////////////////////////////
// Conv_Interleaved_32_1_flex
//
// Convolution: each work-item computes 1 patch x 32 filters worth of output
// data.  The kernel's inner loop works on a single tile consisting of one
// row from each patch and the filter data corresponding to that row.  The filter
// matrix is interleaved to reduce GRF bank conflicts.  Patches are walked
// by rows and then by slices.  Relies on the sub_group extension for block
// reads and SIMD broadcast.  Allows flexible sizing of TILE width (TILE_N)
// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
// the other uses TILE_N = 8, 16, or 24.
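//
// Rough picture of one sub-group's work (explanatory note, not normative): every lane of
// the 8-wide sub-group handles a different patch (output pixel), while the four Dtype8
// accumulators blockC00..blockC30 in each lane hold that pixel's 32 output channels.
// DOT_PRODUCT_8(acc, a, b) multiplies the lane's own input value `a` by the 8 filter
// weights spread across the lanes of `b` (gathered with sub_group_broadcast), i.e. one
// mad per output channel per input element.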
#define TILE_M 1
#define TILE_K KERNEL_WIDTH
#define TILE_N 32

__attribute__((intel_reqd_sub_group_size(8)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    __global Dtype *dst = dst_base + dst_offset;
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;

#define DOT_PRODUCT_8( _result, _rowA, colB ) \
    { \
        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
    }
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;

    // True for all threads if filter_width is a multiple of TILE_N;
    // else, true for all but the right-most column of threads.
    if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
    {
        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
        Dtype8 blockC00 = 0.f;
        Dtype8 blockC10 = 0.f;
        Dtype8 blockC20 = 0.f;
        Dtype8 blockC30 = 0.f;

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x = ( global_y % output_width ) * STRIDE_X;
        int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
        int saved_y = curr_y;
#endif
        const __global Dtype *src0_read = src0
          + aligned_input_size * global_z         // batch offset
          + (curr_y - INPUT_PAD_H) * ROW_PITCH    // y offset
          + (curr_x - INPUT_PAD_W);               // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
            curr_y = saved_y;
#endif

            do
            {
                // Load atile and btile.
                // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype8 granularity.
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
                // interleaved row is padded with zero to ensure same size as interleaved rows.  This
                // interleaving is done to ensure 0% GRF bank conflicts.  For example, this is how the
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
                // (0, 1) (8, 1) (16, 1) (24, 1) ...   =>  (0, 2) (8, 2) (16, 2) (24, 2) ...
                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
                // ...
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;

#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
    #if KERNEL_WIDTH == 3
                Dtype_t blockA00 = vload3(0, src0_read);
                Dtype* pblockA00 = (Dtype*)(&blockA00);
    #else
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
                Dtype* pblockA00 = (Dtype*)(&blockA00);
    #endif
#else
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y >= INPUT_PAD_H &&
                        curr_y < input_height + INPUT_PAD_H &&
                        curr_x + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y += DILATION_Y;
#endif
                src0_read += (ROW_PITCH * DILATION_Y);

                Dtype blockB00[KERNEL_WIDTH*4];
                Dtype8* p8BlockB00 = (Dtype8*)blockB00;
                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
                Dtype*  pBlockB00  = (Dtype* )blockB00;

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE *)src1_read ) );
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
                    p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE *)src1_read ) );
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                } )
                kernel_y = interleaved_y * 2;
                if ( kernel_width_is_odd )
                {
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                }
            }

            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );

            // reset to start of next slice of patch
            src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
        int out_offset = global_z * out_pitch_z                                            // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
         + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X      // y offset
         + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;                    // x offset

        __global Dtype *out = dst + out_offset;
#if APPLY_BIAS
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
        if (group_x > 0xFFFFFFFEul) {
            dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
        }
#else
        const Dtype bias[4] = {0, 0, 0, 0};
#endif
        if (global_y * TILE_M < output_width * output_height )
        {
            for (int i = 0; i < 8; i++)
            {
                ACTIVATION_FUNCTION(dst, out_offset + ( 0 + i ) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
                ACTIVATION_FUNCTION(dst, out_offset + ( 8 + i ) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + 8 + i);
                ACTIVATION_FUNCTION(dst, out_offset + ( 16 + i ) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + 16 + i);
                ACTIVATION_FUNCTION(dst, out_offset + ( 24 + i ) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + 24 + i);
            }
        }
    }
#if TILE_N_LAST > 0
    else
    {

        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
        int i = 0;
        Dtype8 blockC[TILE_N_LAST_DIV8];
        LOOP(TILE_N_LAST_DIV8, i,
        {
            blockC[i] = 0.f;
        } )

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x = ( global_y % output_width ) * STRIDE_X;
        int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
        int saved_y = curr_y;
#endif
        const __global Dtype *src0_read = src0
          + aligned_input_size * global_z         // batch offset
          + (curr_y - INPUT_PAD_H) * ROW_PITCH    // y offset
          + (curr_x - INPUT_PAD_W);               // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
            curr_y = saved_y;
#endif
            do
            {
                // Load atile and interleaved btile.
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
                Dtype* pblockA00 = (Dtype*)(&blockA00);
#else
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y >= INPUT_PAD_H &&
                        curr_y < input_height + INPUT_PAD_H &&
                        curr_x + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y += DILATION_Y;
#endif
                src0_read += (ROW_PITCH * DILATION_Y);
                Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype4* p4BlockB = (Dtype4* )blockB;
                    p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    //TODO: broken.  No block_read6
                    Dtype6* p6BlockB = (Dtype6* )blockB;
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype* pBlockB = (Dtype* )blockB;
                    pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    Dtype3* p3BlockB = (Dtype3* )blockB;
                    p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
                    p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 2 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs
                Dtype* pBlockB = (Dtype*)blockB;
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                } )
                kernel_y = interleaved_y * 2;
                if ( kernel_width_is_odd )
                {
                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                }
            }

            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );

            // reset to start of next slice of patch
            src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
        int out_offset = global_z * out_pitch_z                                            // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
         + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X      // y offset
         + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;                    // x offset
        __global Dtype *out = dst + out_offset;
#if APPLY_BIAS
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
        if (group_x > 0xFFFFFFFEul) {
            dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
        }
#else
        const Dtype bias[4] = {0, 0, 0, 0};
#endif

        if (global_y * TILE_M < output_width * output_height )
        {
            for (int i = 0; i < 8; i++)
            {
                if ( TILE_N_LAST_DIV8 > 0 )
                {
                    ACTIVATION_FUNCTION(dst, out_offset + ( 0+i) * out_pitch_y, blockC[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
                }
                if ( TILE_N_LAST_DIV8 > 1 )
                {
                    ACTIVATION_FUNCTION(dst, out_offset + ( 8+i) * out_pitch_y, blockC[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
                }
                if ( TILE_N_LAST_DIV8 > 2 )
                {
                    ACTIVATION_FUNCTION(dst, out_offset + (16+i) * out_pitch_y, blockC[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
                }
                if ( TILE_N_LAST_DIV8 > 3 )
                {
                    ACTIVATION_FUNCTION(dst, out_offset + (24+i) * out_pitch_y, blockC[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
                }
            }
        }
    }
#endif
}
#endif
#ifdef GEMM_LIKE_CONV_32_2

//////////////////////////////////////////////////////////////////////////////
// Conv_Interleaved_32_2_flex
//
// Convolution: each work-item computes 2 patches x 32 filters worth of output
// data.  The kernel's inner loop works on a single tile consisting of one
// row from each patch and the filter data corresponding to that row.  The filter
// matrix is interleaved to reduce GRF bank conflicts.  Patches are walked
// by rows and then by slices.  Relies on the sub_group extension for block
// reads and SIMD broadcast.  Allows flexible sizing of TILE width (TILE_N)
// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
// the other uses TILE_N = 8, 16, or 24.
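//
// Explanatory note: relative to the TILE_M = 1 variant above, each work-item here keeps
// two sets of accumulators (blockC*0 for patch 0, blockC*1 for patch 1), so every block
// of interleaved filter data read from src1 is reused for two output pixels; the walk
// over rows and slices is otherwise identical to Conv_Interleaved_32_1_flex.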
#define TILE_M 2
#define TILE_K KERNEL_WIDTH
#define TILE_N 32

__attribute__((intel_reqd_sub_group_size(8)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    __global Dtype *dst = dst_base + dst_offset;
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;

#define DOT_PRODUCT_8( _result, _rowA, colB ) \
    { \
        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
    }
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;

    // True for all threads if filter_width is a multiple of TILE_N;
    // else, true for all but the right-most column of threads.
    if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
    {
        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
        Dtype8 blockC00 = 0.f;
        Dtype8 blockC10 = 0.f;
        Dtype8 blockC20 = 0.f;
        Dtype8 blockC30 = 0.f;
        Dtype8 blockC01 = 0.f;
        Dtype8 blockC11 = 0.f;
        Dtype8 blockC21 = 0.f;
        Dtype8 blockC31 = 0.f;

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
        int saved_y0 = curr_y0;
        int saved_y1 = curr_y1;
#endif
        const __global Dtype *src0_read0 = src0
          + aligned_input_size * global_z         // batch offset
          + (curr_y0 - INPUT_PAD_H) * ROW_PITCH   // y offset
          + curr_x0 - INPUT_PAD_W;                // x offset
        const __global Dtype *src0_read1 = src0
          + aligned_input_size * global_z         // batch offset
          + (curr_y1 - INPUT_PAD_H) * ROW_PITCH   // y offset
          + curr_x1 - INPUT_PAD_W;                // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
            do
            {
                // Load atile and btile.
                // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype8 granularity.
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
                // interleaved row is padded with zero to ensure same size as interleaved rows.  This
                // interleaving is done to ensure 0% GRF bank conflicts.  For example, this is how the
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
                // (0, 1) (8, 1) (16, 1) (24, 1) ...   =>  (0, 2) (8, 2) (16, 2) (24, 2) ...
                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
                // ...
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
    #if KERNEL_WIDTH == 3
                Dtype_t blockA00 = vload3(0, src0_read0); src0_read0 += ROW_PITCH;
                Dtype_t blockA01 = vload3(0, src0_read1); src0_read1 += ROW_PITCH;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                Dtype* pblockA01 = (Dtype*)(&blockA01);
    #else
                Dtype_t blockA00 = { (Dtype)0.f };
                Dtype_t blockA01 = { (Dtype)0.f };
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                Dtype* pblockA01 = (Dtype*)(&blockA01);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_x0 + pos < input_width)
                        pblockA00[pos] = src0_read0[pos];

                    if (curr_x1 + pos < input_width)
                        pblockA01[pos] = src0_read1[pos];
                })
                src0_read0 += ROW_PITCH;
                src0_read1 += ROW_PITCH;
    #endif
#else
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y0 >= INPUT_PAD_H &&
                        curr_y0 < input_height + INPUT_PAD_H &&
                        curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read0[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y0 += DILATION_Y;
                Dtype_t blockA01;
                Dtype* pblockA01 = (Dtype*)(&blockA01);
                pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y1 >= INPUT_PAD_H &&
                        curr_y1 < input_height + INPUT_PAD_H &&
                        curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA01[pos] = src0_read1[pos * DILATION_X];
                    else
                        pblockA01[pos] = 0;
                })
                curr_y1 += DILATION_Y;
                src0_read0 += (ROW_PITCH * DILATION_Y);
                src0_read1 += (ROW_PITCH * DILATION_Y);
#endif
                Dtype blockB00[KERNEL_WIDTH*4];
                Dtype8* p8BlockB00 = (Dtype8*)blockB00;
                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
                Dtype*  pBlockB00  = (Dtype* )blockB00;

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
                    p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                }
                // Perform MADs
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                } )
                if ( kernel_width_is_odd )
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                }
            }

            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
            curr_y0 = saved_y0;
            curr_y1 = saved_y1;
#endif
            // reset to start of next slice of patch
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
        int out0_offset = global_z * out_pitch_z                                               // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                                  // channel offset
         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X     // y offset
         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                    // x offset
        int out1_offset = global_z * out_pitch_z                                               // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                                  // channel offset
         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X     // y offset
         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                    // x offset

#if APPLY_BIAS
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
        if (group_x > 0xFFFFFFFEul) {
            dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
        }
#else
        const Dtype bias[4] = {0, 0, 0, 0};
#endif

        if( global_y * TILE_M < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {
                ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
                ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
                ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
                ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
            }
        }
        if( global_y * TILE_M + 1 < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {
                ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC01[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
                ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC11[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
                ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC21[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
                ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC31[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
            }
        }
    }
#if TILE_N_LAST > 0
    else
    {

        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
        int i = 0;
        Dtype8 blockC0[TILE_N_LAST_DIV8];
        Dtype8 blockC1[TILE_N_LAST_DIV8];
        LOOP(TILE_N_LAST_DIV8, i,
        {
            blockC0[i] = 0.f;
            blockC1[i] = 0.f;
        } )

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
        int saved_y0 = curr_y0;
        int saved_y1 = curr_y1;
#endif
        const __global Dtype *src0_read0 = src0
          + aligned_input_size * global_z         // batch offset
          + (curr_y0 - INPUT_PAD_H) * ROW_PITCH   // y offset
          + curr_x0 - INPUT_PAD_W;                // x offset
        const __global Dtype *src0_read1 = src0
          + aligned_input_size * global_z         // batch offset
          + (curr_y1 - INPUT_PAD_H) * ROW_PITCH   // y offset
          + curr_x1 - INPUT_PAD_W;                // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
            do
            {
                // Load atile and interleaved btile.
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;
                Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                Dtype* pblockA01 = (Dtype*)(&blockA01);
#else
                Dtype_t blockA00;
                Dtype* pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y0 >= INPUT_PAD_H &&
                        curr_y0 < input_height + INPUT_PAD_H &&
                        curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read0[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y0 += DILATION_Y;
                Dtype_t blockA01;
                Dtype* pblockA01 = (Dtype*)(&blockA01);
                pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y1 >= INPUT_PAD_H &&
                        curr_y1 < input_height + INPUT_PAD_H &&
                        curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA01[pos] = src0_read1[pos * DILATION_X];
                    else
                        pblockA01[pos] = 0;
                })
                curr_y1 += DILATION_Y;
                src0_read0 += (ROW_PITCH * DILATION_Y);
                src0_read1 += (ROW_PITCH * DILATION_Y);
#endif
                Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype4* p4BlockB = (Dtype4* )blockB;
                    p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    //TODO: broken.  No block_read6
                    Dtype6* p6BlockB = (Dtype6* )blockB;
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
#if TILE_N_LAST_DIV8 == 1
                    Dtype* pBlockB = (Dtype* )blockB;
                    pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 2
                    Dtype2* p2BlockB = (Dtype2* )blockB;
                    p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
#elif TILE_N_LAST_DIV8 == 3
                    Dtype3* p3BlockB = (Dtype3* )blockB;
                    p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
                    p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 8) ) );
#endif
                    src1_read += WIDTH1 * 2;
                }

                // Perform MADs
                Dtype* pBlockB = (Dtype*)blockB;
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                } )
                kernel_y = interleaved_y * 2;
                if ( kernel_width_is_odd )
                {
                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 2
                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#if TILE_N_LAST_DIV8 >= 3
                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y], pBlockB[kernel_idx] );
                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
#endif
#endif
                }
            }

            //while( ++patch_row < 1 ); //debug
            while( ++patch_row < KERNEL_HEIGHT );
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
            curr_y0 = saved_y0;
            curr_y1 = saved_y1;
#endif
            // reset to start of next slice of patch
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
        }
        //while ( ++patch_depth < 1 ); //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
        int out0_offset = global_z * out_pitch_z                                               // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                                  // channel offset
         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X     // y offset
         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                    // x offset
        int out1_offset = global_z * out_pitch_z                                               // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                                  // channel offset
         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X     // y offset
         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                    // x offset
        __global Dtype *out1 = dst + out1_offset;

#if APPLY_BIAS
        Dtype bias[4];
        Dtype4 *bias_vec;
        bias_vec = (Dtype4*)bias;
        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
        if (group_x > 0xFFFFFFFEul) {
            dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
        }
#else
        const Dtype bias[4] = {0, 0, 0, 0};
#endif
        if( global_y * TILE_M < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {
                if ( TILE_N_LAST_DIV8 > 0 )
                {
                    ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC0[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
                }
                if ( TILE_N_LAST_DIV8 > 1 )
                {
                    ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC0[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
                }
                if ( TILE_N_LAST_DIV8 > 2 )
                {
                    ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC0[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
                }
                if ( TILE_N_LAST_DIV8 > 3 )
                {
                    ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC0[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
                }
            }
        }
        if( global_y * TILE_M + 1 < output_width * output_height )
        {
            for( int i = 0; i < 8; i++ )
            {
                if ( TILE_N_LAST_DIV8 > 0 )
                {
                    ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC1[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
                }
                if ( TILE_N_LAST_DIV8 > 1 )
                {
                    ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC1[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
                }
                if ( TILE_N_LAST_DIV8 > 2 )
                {
                    ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC1[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
                }
                if ( TILE_N_LAST_DIV8 > 3 )
                {
                    ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC1[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
                }
            }
        }
    }
#endif
}
#endif
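
// Explanatory note on the shared output path used by the SIMD16 variants below: each
// 16-lane sub-group produces TILE_N = 32 channels per patch in blockC0x/blockC1x, and
// INTERLEAVED_SIMD16_OUTPUT trims the store when OUT_DEPTH is not a multiple of TILE_N.
// For example (derived from the macro, not normative), OUT_DEPTH = 40 leaves the right-most
// column of threads writing only 40 % 16 = 8 channels from blockC0x and none from blockC1x.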
#if defined(GEMM_LIKE_CONV_32_2_SIMD16) || defined(GEMM_LIKE_CONV_32_1_SIMD16)
#define INTERLEAVED_SIMD16_OUTPUT(_out_, _offset_, _m_) do {\
    if (global_y * TILE_M < output_width * output_height ) \
    { \
        if ( ( OUT_DEPTH % TILE_N ) == 0 ) {\
            for (int i = 0; i < 16; i++) \
            { \
                ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
                ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
            } \
        } \
        else if( ( OUT_DEPTH % 16 ) == 0 ) { \
            if ( ( global_x + 1 ) < get_global_size(0) ) { \
                for ( int i = 0; i < 16; i++ ) \
                { \
                    ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
                    ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
                } \
            } \
            else { \
                for (int i = 0; i < 16; i++) \
                { \
                    ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
                } \
            } \
        } \
        else { \
            if ( ( global_x + 1 ) < get_global_size(0) ) \
            { \
                for ( int i = 0; i < 16; i++ ) \
                { \
                    ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
                    ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
                } \
            } \
            else { \
                if ( (OUT_DEPTH % TILE_N) > 16 ) { \
                    for (int i = 0; i < 16 ; i++) \
                    { \
                        ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
                    } \
                    for (int i = 0; i < OUT_DEPTH % 16 ; i++) \
                    { \
                        ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
                    } \
                } \
                else { \
                    for (int i = 0; i < OUT_DEPTH % 16 ; i++) \
                    { \
                        ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
                    } \
                } \
            } \
        } \
    } \
    }while(0)
#endif

#ifdef GEMM_LIKE_CONV_32_1_SIMD16
#define TILE_M 1
#define TILE_K KERNEL_WIDTH
#define TILE_N 32

__attribute__((intel_reqd_sub_group_size(16)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    __global Dtype *dst = dst_base + dst_offset;
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;

    // Result ctile (*dst) is M rows x N columns
    // LWG size is 1x16.  Thus each thread calculates 16*M rows x N cols of ctile.
    Dtype16 blockC00 = 0.f;
    Dtype16 blockC10 = 0.f;

    // Src0 (patch input) is directly used as atile.
    // Each work item points to the start of a different patch.
    // atile is M rows x K columns.
    int curr_x = ( global_y % output_width ) * STRIDE_X;
    int curr_y = ( global_y / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
    int saved_y = curr_y;
#endif
    const __global Dtype *src0_read = src0
      + aligned_input_size * global_z         // batch offset
      + (curr_y - INPUT_PAD_H) * ROW_PITCH    // y offset
      + curr_x - INPUT_PAD_W;                 // x offset
    const __global Dtype *src0_read_orig = src0_read;

    // Src1 (filter) is directly used as btile.
    // It starts at the top of src1 and walks down.
    // btile is K rows x N columns.
    const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2 );

#define DOT_PRODUCT_16( _result, _rowA, colB ) \
    { \
        _result.s0 = mad( _rowA, sub_group_broadcast( colB,  0 ), _result.s0 ); \
        _result.s1 = mad( _rowA, sub_group_broadcast( colB,  1 ), _result.s1 ); \
        _result.s2 = mad( _rowA, sub_group_broadcast( colB,  2 ), _result.s2 ); \
        _result.s3 = mad( _rowA, sub_group_broadcast( colB,  3 ), _result.s3 ); \
        _result.s4 = mad( _rowA, sub_group_broadcast( colB,  4 ), _result.s4 ); \
        _result.s5 = mad( _rowA, sub_group_broadcast( colB,  5 ), _result.s5 ); \
        _result.s6 = mad( _rowA, sub_group_broadcast( colB,  6 ), _result.s6 ); \
        _result.s7 = mad( _rowA, sub_group_broadcast( colB,  7 ), _result.s7 ); \
        _result.s8 = mad( _rowA, sub_group_broadcast( colB,  8 ), _result.s8 ); \
        _result.s9 = mad( _rowA, sub_group_broadcast( colB,  9 ), _result.s9 ); \
        _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \
        _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \
        _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \
        _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \
        _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \
        _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \
    }
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
    // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
    // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
    // and KERNEL_WIDTH/2 rows of interleaved filter.
    int patch_depth = 0;
    __attribute__((opencl_unroll_hint(1)))
    do
    {
        int patch_row = 0;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
        curr_y = saved_y;
#endif
        __attribute__((opencl_unroll_hint(1)))
        do
        {
            // Load atile and btile.
            // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype16 granularity.
            // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
            // interleaved row is padded with zero to ensure same size as interleaved rows.  This
            // interleaving is done to ensure zero GRF bank conflicts.  For example, this is how the
            // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
            // (0, 0) (16, 0) (32, 0) (48, 0) ...     (0, 0) ( 0, 1) (16, 0) ( 0, 1) (32, 0) (0, 1) (48, 0) ...
            // (0, 1) (16, 1) (32, 1) (48, 1) ... =>  (0, 2) (16, 2) (32, 2) (48, 2) ...
            // (0, 2) (16, 2) (32, 2) (48, 2) ...     ...
            // ...
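            // Note on the two atile load paths below: with no padding and no dilation a single
            // vector load pulls KERNEL_WIDTH contiguous pixels of the current patch row;
            // otherwise each of the KERNEL_WIDTH taps is gathered separately and any tap whose
            // padded coordinate falls outside the real image is loaded as 0, so it contributes
            // nothing to the dot products.  Illustrative example (assumed numbers): with
            // KERNEL_WIDTH = 3, INPUT_PAD_W = 1 and curr_x = 0, tap pos = 0 has padded
            // x = 0 < INPUT_PAD_W and loads 0, while taps pos = 1 and 2 read the first two
            // real pixels of the row.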
            const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;

#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
    #if KERNEL_WIDTH == 3
            Dtype_t blockA00 = vload3(0, src0_read);
            Dtype*  pblockA00 = (Dtype*)(&blockA00);
    #else
            Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
            Dtype*  pblockA00 = (Dtype*)(&blockA00);
    #endif
#else
            Dtype_t blockA00;
            Dtype*  pblockA00 = (Dtype*)(&blockA00);
            int pos = 0;
            LOOP(KERNEL_WIDTH, pos,
            {
                if (curr_y >= INPUT_PAD_H &&
                    curr_y < input_height + INPUT_PAD_H &&
                    curr_x + pos * DILATION_X >= INPUT_PAD_W &&
                    curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
                    pblockA00[pos] = src0_read[pos * DILATION_X];
                else
                    pblockA00[pos] = 0;
            })
            curr_y += DILATION_Y;
#endif
            src0_read += ROW_PITCH * DILATION_Y;
            INT_TYPE blockB00[KERNEL_WIDTH * 2];
            INT_TYPE4* p4BlockB00 = (INT_TYPE4*)blockB00;
            INT_TYPE2* p2BlockB00 = (INT_TYPE2*)blockB00;
            Dtype* pBlockB00 = (Dtype*)blockB00;
            interleaved_y = 0;
            LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
            {
                p4BlockB00[interleaved_y] = SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read );
                src1_read += WIDTH1 * 2;
            } )
            if ( kernel_width_is_odd )
            {
                p2BlockB00[KERNEL_WIDTH - 1] = SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read );
                src1_read += WIDTH1 * 2;
            }

            // Perform MADs
            kernel_idx = 0;
            interleaved_y = 0;
            LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
            {
                kernel_y = interleaved_y * 2;
                DOT_PRODUCT_16( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                DOT_PRODUCT_16( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
            } )
            if ( kernel_width_is_odd )
            {
                kernel_y = interleaved_y * 2;
                DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
            }
        }

        //while( ++patch_row < 1 );  //debug
        while( ++patch_row < KERNEL_HEIGHT );

        // reset to start of next slice of patch
        src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
    }
    //while ( ++patch_depth < 1 );  //debug
    while ( ++patch_depth < INPUT_DEPTH );

    // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
    // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
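    // Worked example of the offset arithmetic below (illustrative numbers): with
    // output_width = 7, TILE_M = 1 and global_y = 9, this work item owns the output pixel at
    // y = 9 / 7 = 1, x = 9 % 7 = 2, and writes the TILE_N = 32 consecutive output channels
    // starting at channel group_x * 32, each channel plane out_pitch_y elements apart in dst.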
    int out_offset = global_z * out_pitch_z                                             // batch offset
     + ( group_x * TILE_N ) * out_pitch_y                                               // channel offset
     + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X      // y offset
     + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;                     // x offset
    __global Dtype *out = dst + out_offset;

#if APPLY_BIAS
    Dtype bias[2];
    Dtype2 *bias_vec;
    bias_vec = (Dtype2*)bias;
    *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
    if (group_x > 0xFFFFFFFEul) {
        dst[0] = bias[0] + bias[1];
    }
#else
    const Dtype bias[2] = {0, 0};
#endif
    INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
}
#endif

#ifdef GEMM_LIKE_CONV_32_2_SIMD16

//////////////////////////////////////////////////////////////////////////////
// Conv_Interleaved_32_2_SIMD16
//
// Convolution: each workitem computes 2 patches x 32 filters worth of output
// data.
#define TILE_M          2
#define TILE_K          KERNEL_WIDTH
#define TILE_N          32

__attribute__((intel_reqd_sub_group_size(16)))
__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
{
    __global Dtype *dst = dst_base + dst_offset;
    const int group_x = get_group_id(0);
    const int group_y = get_group_id(1);
    const int global_x = get_global_id(0);
    const int global_y = get_global_id(1);
    const int global_z = get_global_id(2);
    int interleaved_y;
    int kernel_y;
    int kernel_idx;
#define DOT_PRODUCT_16( _result, _rowA, colB ) \
    { \
        _result.s0 = mad( _rowA, sub_group_broadcast( colB,  0 ), _result.s0 ); \
        _result.s1 = mad( _rowA, sub_group_broadcast( colB,  1 ), _result.s1 ); \
        _result.s2 = mad( _rowA, sub_group_broadcast( colB,  2 ), _result.s2 ); \
        _result.s3 = mad( _rowA, sub_group_broadcast( colB,  3 ), _result.s3 ); \
        _result.s4 = mad( _rowA, sub_group_broadcast( colB,  4 ), _result.s4 ); \
        _result.s5 = mad( _rowA, sub_group_broadcast( colB,  5 ), _result.s5 ); \
        _result.s6 = mad( _rowA, sub_group_broadcast( colB,  6 ), _result.s6 ); \
        _result.s7 = mad( _rowA, sub_group_broadcast( colB,  7 ), _result.s7 ); \
        _result.s8 = mad( _rowA, sub_group_broadcast( colB,  8 ), _result.s8 ); \
        _result.s9 = mad( _rowA, sub_group_broadcast( colB,  9 ), _result.s9 ); \
        _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \
        _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \
        _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \
        _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \
        _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \
        _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \
    }
    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;

    // True for all threads if filter_width is multiple of TILE_N
    // else, true for all but right-most column of threads.
    {
        // Result ctile (*dst) is M rows x N columns
        // LWG size is 1x16.  Thus each thread calculates 16*M rows x N cols of ctile.
        Dtype16 blockC00 = 0.f;
        Dtype16 blockC10 = 0.f;
        Dtype16 blockC01 = 0.f;
        Dtype16 blockC11 = 0.f;

        // Src0 (patch input) is directly used as atile.
        // Each work item points to the start of a different patch.
        // atile is M rows x K columns.
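        // Note: with TILE_M = 2 each work item accumulates two consecutive output pixels
        // (row-major indices global_y * 2 and global_y * 2 + 1), hence the two patch cursors
        // (curr_x0/curr_y0, curr_x1/curr_y1), two input read pointers (src0_read0, src0_read1)
        // and four accumulators: blockC00/blockC10 hold the first/second 16 output channels of
        // this tile for pixel 0, and blockC01/blockC11 the same for pixel 1.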
        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
        int saved_y0 = curr_y0;
        int saved_y1 = curr_y1;
#endif
        const __global Dtype *src0_read0 = src0
         + aligned_input_size * global_z        // batch offset
         + (curr_y0 - INPUT_PAD_H) * ROW_PITCH  // y offset
         + curr_x0 - INPUT_PAD_W;               // x offset
        const __global Dtype *src0_read1 = src0
         + aligned_input_size * global_z        // batch offset
         + (curr_y1 - INPUT_PAD_H) * ROW_PITCH  // y offset
         + curr_x1 - INPUT_PAD_W;               // x offset

        // Src1 (filter) is directly used as btile.
        // It starts at the top of src1 and walks down.
        // btile is K rows x N columns.
        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);

        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
        // and KERNEL_WIDTH/2 rows of interleaved filter.
        int patch_depth = 0;
        do
        {
            int patch_row = 0;
            do
            {
                // Load atile and btile.
                // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype8 granularity.
                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
                // interleaved row is padded with zero to ensure same size as interleaved rows.  This
                // interleaving is done to ensure zero GRF bank conflicts.  For example, this is how the
                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
                // (0, 0) (8, 0) (16, 0) (24, 0) ...     (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
                // (0, 1) (8, 1) (16, 1) (24, 1) ... =>  (0, 2) (8, 2) (16, 2) (24, 2) ...
                // (0, 2) (8, 2) (16, 2) (24, 2) ...     ...
                // ...
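                // Note on the MAD schedule further below: every weight value fetched by the
                // subgroup block reads is consumed twice, once per output pixel.  For each
                // kernel tap, pBlockB00[kernel_idx] is combined first with pblockA00 (pixel 0,
                // accumulating into blockC00/blockC10) and then with pblockA01 (pixel 1,
                // accumulating into blockC01/blockC11) before kernel_idx advances, so both
                // patches share a single pass over the filter data.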
                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;
                Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
                Dtype*  pblockA01 = (Dtype*)(&blockA01);
#else
                Dtype_t blockA00;
                Dtype*  pblockA00 = (Dtype*)(&blockA00);
                int pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y0 >= INPUT_PAD_H &&
                        curr_y0 < input_height + INPUT_PAD_H &&
                        curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA00[pos] = src0_read0[pos * DILATION_X];
                    else
                        pblockA00[pos] = 0;
                })
                curr_y0 += DILATION_Y;
                Dtype_t blockA01;
                Dtype*  pblockA01 = (Dtype*)(&blockA01);
                pos = 0;
                LOOP(KERNEL_WIDTH, pos,
                {
                    if (curr_y1 >= INPUT_PAD_H &&
                        curr_y1 < input_height + INPUT_PAD_H &&
                        curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
                        curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
                        pblockA01[pos] = src0_read1[pos * DILATION_X];
                    else
                        pblockA01[pos] = 0;
                })
                curr_y1 += DILATION_Y;
                src0_read0 += (ROW_PITCH * DILATION_Y);
                src0_read1 += (ROW_PITCH * DILATION_Y);
#endif
                Dtype blockB00[KERNEL_WIDTH * 2];
                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
                Dtype2* p2BlockB00 = (Dtype2*)blockB00;
                Dtype*  pBlockB00  = (Dtype* )blockB00;

                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    p4BlockB00[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                } )
                if ( kernel_width_is_odd )
                {
                    p2BlockB00[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
                    src1_read += WIDTH1 * 2;
                }
                // Perform MADs
                kernel_idx = 0;
                interleaved_y = 0;
                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
                } )
                if ( kernel_width_is_odd )
                {
                    kernel_y = interleaved_y * 2;
                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
                }
            }

            //while( ++patch_row < 1 );  //debug
            while( ++patch_row < KERNEL_HEIGHT );
#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
            curr_y0 = saved_y0;
            curr_y1 = saved_y1;
#endif
            // reset to start of next slice of patch
            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
        }
        //while ( ++patch_depth < 1 );  //debug
        while ( ++patch_depth < INPUT_DEPTH );

        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
        int out0_offset = global_z * out_pitch_z                                            // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                               // channel offset
         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X  // y offset
         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                 // x offset
        int out1_offset = global_z * out_pitch_z                                            // batch offset
         + ( group_x * TILE_N ) * out_pitch_y                                               // channel offset
         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X  // y offset
         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                 // x offset

#if APPLY_BIAS
        Dtype bias[2];
        Dtype2 *bias_vec;
        bias_vec = (Dtype2*)bias;
        *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
        if (group_x > 0xFFFFFFFEul) {
            dst[0] = bias[0] + bias[1];
        }
#else
        const Dtype bias[2] = {0, 0};
#endif
        INTERLEAVED_SIMD16_OUTPUT(dst, out0_offset, 0);
        INTERLEAVED_SIMD16_OUTPUT(dst, out1_offset, 1);
    }
}
#endif

#elif defined KERNEL_DWCONV

__kernel void DWCONV(
    ELTWISE_DATA_ARG
    FUSED_ARG
    __global Dtype* image_data,
    __global Dtype* kernel_data,
    BIAS_KERNEL_ARG
    __global Dtype* convolved_image_base,
    const int convolved_image_offset,
    const ushort input_width,
    const ushort input_height,
    const ushort output_width,
    const ushort output_height) {
    __global Dtype* convolved_image = convolved_image_base + convolved_image_offset;
    const int outputX = get_global_id(0);
    const int outputY = get_global_id(1);
    const int outputZ = get_global_id(2);
    if(outputX < output_width && outputY < output_height)
    {
        Dtype sum = 0.;

        const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;
        const int org_x = outputX * STRIDE_X - INPUT_PAD_W;
        const int currentKernelOffset = KERNEL_SIZE * (outputZ % CHANNELS);
        const int biasIndex = outputZ % CHANNELS;
        const int local_image_offset = org_y * input_width + org_x;
        const int imageSize = input_width * input_height;

        __global Dtype* image_dataPtrFloat = (image_data + (imageSize * outputZ + local_image_offset));
        __global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));

        for(int y = 0; y < KERNEL_H; y++)
        {
            for(int x = 0; x < KERNEL_W; x++)
            {
                if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height &&
                     org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))
                {
                    continue;
                }
                sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];
            }
            image_dataPtrFloat += input_width * DILATION_Y;
            kernel_dataPtrFloat += KERNEL_W;
        }

    #if APPLY_BIAS
        int offset = outputZ * output_height * output_width + outputY * output_width + outputX;
        ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);
    #else
        int offset = outputZ * output_height * output_width + outputY * output_width + outputX;
        ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);
    #endif
    }
}
#endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV
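
// Depthwise convolution (DWCONV) indexing note: outputZ enumerates batch * CHANNELS planes, so
// outputZ % CHANNELS selects the per-channel filter and bias while imageSize * outputZ selects
// the matching input plane; each work item therefore produces one output pixel of one channel.
// The indexing implies a launch of roughly output_width x output_height x (CHANNELS * batch)
// work items, with the bounds check at the top of the kernel absorbing any rounding-up of the
// global size.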