1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5//  By downloading, copying, installing or using the software you agree to this license.
6//  If you do not agree to this license, do not download, install,
7//  copy or use the software.
8//
9//
10//                           License Agreement
11//                For Open Source Computer Vision Library
12//
13// Copyright (C) 2017, Intel Corporation, all rights reserved.
14// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
15// Third party copyrights are property of their respective owners.
16//
17// Redistribution and use in source and binary forms, with or without modification,
18// are permitted provided that the following conditions are met:
19//
20//   * Redistribution's of source code must retain the above copyright notice,
21//     this list of conditions and the following disclaimer.
22//
23//   * Redistribution's in binary form must reproduce the above copyright notice,
24//     this list of conditions and the following disclaimer in the documentation
25//     and/or other materials provided with the distribution.
26//
27//   * The name of the copyright holders may not be used to endorse or promote products
28//     derived from this software without specific prior written permission.
29//
30// This software is provided by the copyright holders and contributors "as is" and
31// any express or implied warranties, including, but not limited to, the implied
32// warranties of merchantability and fitness for a particular purpose are disclaimed.
33// In no event shall the Intel Corporation or contributors be liable for any direct,
34// indirect, incidental, special, exemplary, or consequential damages
35// (including, but not limited to, procurement of substitute goods or services;
36// loss of use, data, or profits; or business interruption) however caused
37// and on any theory of liability, whether in contract, strict liability,
38// or tort (including negligence or otherwise) arising in any way out of
39// the use of this software, even if advised of the possibility of such damage.
40//
41//M*/
42
43#if defined(cl_khr_fp16)
44#pragma OPENCL EXTENSION cl_khr_fp16 : enable
45#endif
46
47#define KERNEL_ARG_DTYPE float
48#define TYPE_FLOAT  1
49#define TYPE_HALF   2
50
51#if defined(FUSED_CONV_RELU)
52#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope)))
53#define FUSED_ARG KERNEL_ARG_DTYPE negative_slope,
54#elif defined(FUSED_CONV_PRELU)
55#define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (negative_slope[c])))
56#define FUSED_ARG __global const KERNEL_ARG_DTYPE* negative_slope,
57#elif defined(FUSED_CONV_POWER)
58#define ACTIVATION_RELU_FUNCTION(x, c) pow(x, (Dtype)power)
59#define FUSED_ARG KERNEL_ARG_DTYPE power,
60#elif defined(FUSED_CONV_TANH)
61#define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)
62#define FUSED_ARG
63#elif defined(FUSED_CONV_RELU6)
64#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), (Dtype)min_value, (Dtype)max_value))
65#define FUSED_ARG KERNEL_ARG_DTYPE min_value, KERNEL_ARG_DTYPE max_value,
66#else
67#define ACTIVATION_RELU_FUNCTION(x, c) (x)
68#define FUSED_ARG
69#endif
70
71#ifdef FUSED_CONV_ELTWISE
72#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
73    const Dtype _x_ = eltwise_data[(_offset_)] + (_data_); \
74    (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
75} while(0)
76#define ELTWISE_DATA_ARG __global Dtype* eltwise_data,
77#else
78#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_, _channel_) do { \
79    const Dtype _x_ = (_data_); \
80    (_dst_)[(_offset_)] = ACTIVATION_RELU_FUNCTION(_x_, _channel_); \
81} while(0)
82#define ELTWISE_DATA_ARG
83#endif
84
85#if APPLY_BIAS
86#define BIAS_KERNEL_ARG __global Dtype * biases_base,
87#else
88#define BIAS_KERNEL_ARG
89#endif
90
91#define __CAT(x, y) x##y
92#define CAT(x, y) __CAT(x, y)
93#define LOOP0(VAR, STMT)
94#define LOOP1(VAR, STMT) (STMT); (VAR)++;
95#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
96#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
97#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
98#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
99#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
100#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
101#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
102#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
103#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
104#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
105#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
106#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
107#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++;
108#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++;
109#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++;
110#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
111
112#if defined(convolve_simd) || defined(Conv_Interleaved)
113#if TYPE == TYPE_HALF
114#define INT_TYPE ushort
115#define INT_TYPE2 ushort2
116#define INT_TYPE4 ushort4
117#define INT_TYPE8 ushort8
118#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read_us2
119#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read_us4
120#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read_us8
121#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read_us
122#else
123#define INT_TYPE uint
124#define INT_TYPE2 uint2
125#define INT_TYPE4 uint4
126#define INT_TYPE8 uint8
127#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read2
128#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read4
129#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8
130#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read
131#endif
132#endif
133
134#ifdef KERNEL_BASIC
135
136__kernel void ConvolveBasic(
137    ELTWISE_DATA_ARG
138    FUSED_ARG
139    __global Dtype* image_data,
140    int image_offset,
141    __global Dtype* kernel_data,
142    int kernel_offset,
143    __global Dtype* bias,
144    const int bias_offset,
145    __global Dtype* convolved_image_base,
146    const int convolved_image_base_offset,
147    const int convolved_image_offset,
148    const ushort input_width,
149    const ushort input_height,
150    const ushort output_width,
151    const ushort output_height,
152    const ushort pad_w,
153    const ushort pad_h
154)
155{
156    __global Dtype* convolved_image = convolved_image_base + convolved_image_base_offset;
157    const int outputX = get_global_id(0);
158    const int outputY = get_global_id(1);
159    const int kernelNum = get_global_id(2) * ZPAR;
160    if (outputX < output_width && outputY < output_height)
161    {
162        Dtype sum[ZPAR];
163        for (int kern = 0; kern < ZPAR; kern++)
164        {
165            sum[kern] = 0.0f;
166        }
167        const int org_y = outputY * STRIDE_Y - pad_h;
168        const int org_x = outputX * STRIDE_X - pad_w;
169        const int currentKernelOffset = kernel_offset + kernelNum*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
170#if APPLY_BIAS
171        const int biasIndex = bias_offset + kernelNum;
172#endif
173        const int local_image_offset = org_y * input_width + org_x;
174        const int imageSize = input_width * input_height;
175        __global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset));
176        __global Dtype* kernel_dataPtr = (kernel_data + (currentKernelOffset));
177        for (int c = 0; c < CHANNELS; c++)
178        {
179            for (int y = 0; y < KERNEL_HEIGHT; y++)
180            {
181                for (int x = 0; x < KERNEL_WIDTH; x++)
182                {
183                    int y_ = org_y + y * DILATION_Y;
184                    int x_ = org_x + x * DILATION_X;
185                    if (!(y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width))
186                    {
187                        continue;
188                    }
189                    for (int kern = 0; kern < ZPAR; kern++)
190                    {
191                        sum[kern] += image_dataPtr[x * DILATION_X] * kernel_dataPtr[kern*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS + x];
192                    }
193                }
194                image_dataPtr += input_width * DILATION_Y;
195                kernel_dataPtr += KERNEL_WIDTH;
196            }
197            image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y;
198        }
199
200        for (int kern = 0; kern < ZPAR; kern++)
201        {
202            if (kernelNum + kern < OUTPUT_Z)
203            {
204                int offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX;
205#if APPLY_BIAS
206                ACTIVATION_FUNCTION(convolved_image, offset, sum[kern] + bias[biasIndex + kern], biasIndex + kern);
207#else
208                ACTIVATION_FUNCTION(convolved_image, offset, sum[kern], kernelNum + kern);
209#endif
210            }
211        }
212    }
213}
214
215#elif defined KERNEL_IDLF
216
217// Each work-item computes a OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map.
218// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image.
219// NDRange:  (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH
220
221// NOTE: for beignet this reqd_work_group_size does not guarantee that SIMD16 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break.
222__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
223__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
224__kernel void
225convolve_simd(
226    ELTWISE_DATA_ARG
227    FUSED_ARG
228    __global Dtype* inputs,
229    __global Dtype* weights,
230    BIAS_KERNEL_ARG
231    __global Dtype* outputs_base,
232    const int outputs_offset,
233    const ushort input_width,
234    const ushort input_height,
235    const ushort output_width,
236    const ushort output_height)
237{
238  __global Dtype* outputs = outputs_base + outputs_offset;
239  unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH;  // oc = Output Column
240  unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
241  unsigned int fm = get_global_id(2);                    // fm = Feature Map = od = Output Depth
242  unsigned int fmg = get_group_id(2);
243  unsigned int lid = get_local_id(2);
244
245  Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0.0f };
246
247  // find weights address of given neuron (lid is index)
248  unsigned int weight_addr = (fmg % FILTERS_IN_GROUP) *
249                             INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;
250
251  unsigned int num_in_batch = fm / ALIGNED_NUM_FILTERS;
252
253  unsigned int input_batch_offset = num_in_batch * INPUT_PITCH * TOTAL_INPUT_DEPTH_SIZE;
254
255  int curr_y = or * STRIDE_Y;
256  int curr_x = oc * STRIDE_X + lid;
257
258  int in_addr = input_batch_offset
259                +  (curr_y - INPUT_PAD_H) * INPUT_WIDTH          // y tile offset
260                +   curr_x - INPUT_PAD_W;                        // x tile offset
261
262  const int in_limit = (get_global_size(2) / ALIGNED_NUM_FILTERS) * TOTAL_INPUT_DEPTH_SIZE * INPUT_PITCH - 1;
263
264  Dtype in_buf[INVEC_SIZE];
265
266  for(int kd = 0; kd < INPUT_DEPTH; kd++)
267  {
268#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
269    const bool cx_out_of_range = !(curr_x >= INPUT_PAD_W && curr_x < INPUT_WIDTH + INPUT_PAD_W);
270    int in_offset = in_addr;
271    __attribute__((opencl_unroll_hint(INVEC_SIZE)))
272    for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)
273    {
274      Dtype input = inputs[clamp(in_offset, 0, in_limit)];
275      int cy = curr_y + reg;
276      in_buf[reg] = (cx_out_of_range || cy < INPUT_PAD_H || cy >= INPUT_HEIGHT + INPUT_PAD_H) ? 0 : input;
277    }
278#else
279    int in_offset = in_addr;
280    __attribute__((opencl_unroll_hint(INVEC_SIZE)))
281    for (int reg = 0; reg < INVEC_SIZE; reg++, in_offset += INPUT_WIDTH)
282    {
283      in_buf[reg] = inputs[min(in_offset, in_limit)];
284    }
285#endif
286
287    in_addr += INPUT_PITCH;
288
289#define BLOCK_IN(n, c) intel_sub_group_shuffle(in_buf[n], (c))
290
291    int kr = 0;  // kr = Kernel Row
292    LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop.
293    {
294        int kc = 0;  // kc = Kernel Column
295        LOOP(KERNEL_WIDTH, kc,
296        {
297            Dtype weight_value = weights[weight_addr];
298            weight_addr += SIMD_SIZE;
299            for (int br=0; br < OUT_BLOCK_HEIGHT; br++)
300            {
301                for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++)
302                {
303                    Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y), bc * STRIDE_X + kc * DILATION_X);
304                    out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_value, input, out[br * OUT_BLOCK_WIDTH + bc]);
305                }
306            }
307        });
308    });
309  }
310
311  fm = fm % ALIGNED_NUM_FILTERS;
312
313#if LEFT_FILTERS > 0
314  if (fm < NUM_FILTERS)
315#endif
316  {
317    unsigned int out_addr = (num_in_batch * TOTAL_OUTPUT_DEPTH + fm) * OUTPUT_PITCH;
318    out_addr += or * output_width + oc;
319    // we need this address calculation for biases because we support views and batching
320#if APPLY_BIAS
321    Dtype bias = biases_base[fm];
322#else
323    Dtype bias = 0;
324#endif
325
326    for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++)
327    {
328      if (r + or >= output_height) break;
329      for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++)
330      {
331        if (c + oc >= output_width) break;
332        // this does a scattered write to SIMD_SIZE different feature maps,
333        // so that data within one map is contiguous, thus ready for input to next layer.
334        ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c], fm);
335      }
336    }
337  }
338}
339
340#elif defined KERNEL_GEMM_LIKE
341
342#if APPLY_BIAS
343#define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i)
344#else
345#define SUBGROUP_GET_BIAS(k, i) ((Dtype)0)
346#endif
347
348#ifdef Conv_Interleaved
349typedef struct float1 { float s0; } float1;
350typedef struct float5 { float s0; float s1; float s2; float s3; float s4; } float5;
351typedef struct float6 { float s0; float s1; float s2; float s3; float s4; float s5; } float6;
352typedef struct float7 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; } float7;
353typedef struct float9 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; } float9;
354typedef struct float10 { float s0; float s1; float s2; float s3; float s4; float s5;
355                         float s6; float s7; float s8; float s9;} float10;
356typedef struct float11 { float s0; float s1; float s2; float s3; float s4; float s5;
357                         float s6; float s7; float s8; float s9; float sa;} float11;
358typedef struct float12 { float s0; float s1; float s2; float s3; float s4; float s5;
359                         float s6; float s7; float s8; float s9; float sa; float sb; } float12;
360typedef struct float13 { float s0; float s1; float s2; float s3; float s4; float s5;
361                         float s6; float s7; float s8; float s9; float sa; float sb; float sc;} float13;
362typedef struct float14 { float s0; float s1; float s2; float s3; float s4; float s5;
363                         float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; } float14;
364typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float s5;
365                         float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;
366typedef struct float0 { float s0; } float0; //never used but makes compiler happy.
367
368typedef struct half1 { half s0; } half1;
369typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5;
370typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6;
371typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7;
372typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9;
373typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5;
374                        half s6; half s7; half s8; half s9; } half10;
375typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5;
376                        half s6; half s7; half s8; half s9; half sa; } half11;
377typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5;
378                        half s6; half s7; half s8; half s9; half sa; half sb; } half12;
379typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5;
380                        half s6; half s7; half s8; half s9; half sa; half sb; half sc; } half13;
381typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5;
382                        half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; } half14;
383typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5;
384                        half s6; half s7; half s8; half s9; half sa; half sb; half sc; half sd; half se; } half15;
385typedef struct half0 { half s0; } half0; //never used but makes compiler happy.
386
387#define OUT_PITCH_X output_width
388#define ROW_PITCH input_width
389
390#define GEMM_LIKE_KERNEL_ARGS     \
391    ELTWISE_DATA_ARG              \
392    FUSED_ARG                     \
393    const __global Dtype *src0,   \
394    const __global Dtype *src1,   \
395    BIAS_KERNEL_ARG               \
396    __global Dtype *dst_base,     \
397    const int dst_offset,         \
398    const ushort input_width,     \
399    const ushort input_height,    \
400    const ushort output_width,    \
401    const ushort output_height,   \
402    const int out_pitch_y,     \
403    const int out_pitch_z,     \
404    const int aligned_input_size, \
405    const int slice_pitch
406#endif
407
408#ifdef GEMM_LIKE_CONV_32_1
409//////////////////////////////////////////////////////////////////////////////
410// Conv_Interleaved_32_1_flex
411//
412// Convolution: each workitem computes 1 patch x 32 filters worth of output
413// data.  Kernel's inner loop works on a single tile consisting of one
414// row from each patch and the filter data corresponding to that row.  Filter
415// matrix is interleaved to reduce GRF bank conflicts.  Patches are walked
416// by rows and then by slices.  Relies on sub_group extension for block
417// reads and SIMD broadcast.  Allows flexible sizing of TILE width (TILE_N)
418// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
419// the other uses TILE_N = 8, 16, or 24.
420#define TILE_M          1
421#define TILE_K          KERNEL_WIDTH
422#define TILE_N          32
423
424__attribute__((intel_reqd_sub_group_size(8)))
425__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
426{
427    __global Dtype *dst = dst_base + dst_offset;
428    const int group_x = get_group_id(0);
429    const int group_y = get_group_id(1);
430    const int global_x = get_global_id(0);
431    const int global_y = get_global_id(1);
432    const int global_z = get_global_id(2);
433    int interleaved_y;
434    int kernel_y;
435    int kernel_idx;
436
437#define DOT_PRODUCT_8( _result, _rowA, colB )    \
438    {   \
439        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 );  \
440        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 );  \
441        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 );  \
442        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 );  \
443        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 );  \
444        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 );  \
445        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 );  \
446        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 );  \
447    }
448    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
449
450    // True for all threads if filter_width is multiple of TILE_N
451    // else, true for all but right-most column of threads.
452    if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
453    {
454        // Result ctile (*dst) is M rows x N columns
455        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
456        Dtype8  blockC00 = 0.f;
457        Dtype8  blockC10 = 0.f;
458        Dtype8  blockC20 = 0.f;
459        Dtype8  blockC30 = 0.f;
460
461        // Src0 (patch input) is directly used as atile.
462        // Each work item points to the start of a different patch.
463        // atile is M rows x K columns.
464        int curr_x = ( global_y % output_width ) * STRIDE_X;
465        int curr_y = ( global_y / output_width ) * STRIDE_Y;
466#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
467        int saved_y = curr_y;
468#endif
469        const __global Dtype *src0_read = src0
470          + aligned_input_size * global_z           // batch offset
471          + (curr_y - INPUT_PAD_H) * ROW_PITCH      // y offset
472          + (curr_x - INPUT_PAD_W);                 // x offset
473
474        // Src1 (filter) is directly used as btile.
475        // It starts at the top of src1 and walks down.
476        // btile is K rows x N columns.
477        const __global Dtype *src1_read = src1 + ( global_x * TILE_N  * 2);
478
479        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
480        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
481        // and KERNEL_WIDTH/2 rows of interleaved filter.
482        int patch_depth = 0;
483        do
484        {
485            int patch_row = 0;
486#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
487            curr_y = saved_y;
488#endif
489
490            do
491            {
492                // Load atile and btile.
493                // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype8 granularity.
494                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
495                // interleaved row is padded with zero to ensure same size as interleaved rows. This
496                // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
497                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
498                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
499                // (0, 1) (8, 1) (16, 1) (24, 1) ... =>    (0, 2) (8, 2) (16, 2) (24, 2) ...
500                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
501                // ...
502                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
503
504#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
505  #if KERNEL_WIDTH == 3
506                Dtype_t blockA00 = vload3(0, src0_read);
507                Dtype*  pblockA00 = (Dtype*)(&blockA00);
508  #else
509                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
510                Dtype*  pblockA00 = (Dtype*)(&blockA00);
511  #endif
512#else
513                Dtype_t blockA00;
514                Dtype*  pblockA00 = (Dtype*)(&blockA00);
515                int pos = 0;
516                LOOP(KERNEL_WIDTH, pos,
517                {
518                  if (curr_y >= INPUT_PAD_H &&
519                      curr_y < input_height + INPUT_PAD_H &&
520                      curr_x + pos * DILATION_X >= INPUT_PAD_W &&
521                      curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
522                    pblockA00[pos] = src0_read[pos * DILATION_X];
523                  else
524                    pblockA00[pos] = 0;
525                })
526                curr_y += DILATION_Y;
527#endif
528                src0_read += (ROW_PITCH * DILATION_Y);
529
530                Dtype blockB00[KERNEL_WIDTH*4];
531                Dtype8* p8BlockB00 = (Dtype8*)blockB00;
532                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
533                Dtype*  pBlockB00 =  (Dtype* )blockB00;
534
535                interleaved_y = 0;
536                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
537                {
538                    p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE *)src1_read ) );
539                    src1_read += WIDTH1 * 2;
540                } )
541                if ( kernel_width_is_odd )
542                {
543                    p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE *)src1_read ) );
544                    src1_read += WIDTH1 * 2;
545                }
546
547                // Perform MADs
548                kernel_idx = 0;
549                interleaved_y = 0;
550                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
551                {
552                    kernel_y = interleaved_y * 2;
553                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
554                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
555                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
556                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
557                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
558                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
559                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
560                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
561                } )
562                    kernel_y = interleaved_y * 2;
563                if ( kernel_width_is_odd )
564                {
565                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
566                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
567                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
568                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
569                }
570            }
571
572            //while( ++patch_row < 1 ); //debug
573            while( ++patch_row < KERNEL_HEIGHT );
574
575            // reset to start of next slice of patch
576            src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
577        }
578        //while ( ++patch_depth < 1 ); //debug
579        while ( ++patch_depth < INPUT_DEPTH );
580
581        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
582        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
583        int out_offset = global_z * out_pitch_z                                        // batch offset
584         + ( group_x * TILE_N ) * out_pitch_y                                          // channel offset
585         + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X  // y offset
586         + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;                // x offset
587
588        __global Dtype *out = dst + out_offset;
589#if APPLY_BIAS
590        Dtype bias[4];
591        Dtype4 *bias_vec;
592        bias_vec = (Dtype4*)bias;
593        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
594        if (group_x > 0xFFFFFFFEul) {
595          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
596        }
597#else
598        const Dtype bias[4] = {0, 0, 0, 0};
599#endif
600        if (global_y * TILE_M < output_width * output_height )
601        {
602            for (int i = 0; i < 8; i++)
603            {
604            ACTIVATION_FUNCTION(dst, out_offset + ( 0 + i ) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
605            ACTIVATION_FUNCTION(dst, out_offset + ( 8 + i ) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + 8 + i);
606            ACTIVATION_FUNCTION(dst, out_offset + ( 16 + i ) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + 16 + i);
607            ACTIVATION_FUNCTION(dst, out_offset + ( 24 + i ) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + 24 + i);
608            }
609        }
610    }
611#if TILE_N_LAST > 0
612    else
613    {
614
615        // Result ctile (*dst) is M rows x N columns
616        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
617        int i = 0;
618        Dtype8  blockC[TILE_N_LAST_DIV8];
619        LOOP(TILE_N_LAST_DIV8, i,
620        {
621            blockC[i] = 0.f;
622        } )
623
624        // Src0 (patch input) is directly used as atile.
625        // Each work item points to the start of a different patch.
626        // atile is M rows x K columns.
627        int curr_x = ( global_y % output_width ) * STRIDE_X;
628        int curr_y = ( global_y / output_width ) * STRIDE_Y;
629#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
630        int saved_y = curr_y;
631#endif
632        const __global Dtype *src0_read = src0
633          + aligned_input_size * global_z           // batch offset
634          + (curr_y - INPUT_PAD_H) * ROW_PITCH      // y offset
635          + (curr_x - INPUT_PAD_W);                 // x offset
636
637        // Src1 (filter) is directly used as btile.
638        // It starts at the top of src1 and walks down.
639        // btile is K rows x N columns.
640        const __global Dtype *src1_read = src1 + ( global_x * TILE_N  * 2);
641
642        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
643        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
644        // and KERNEL_WIDTH/2 rows of interleaved filter.
645        int patch_depth = 0;
646        do
647        {
648            int patch_row = 0;
649#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
650            curr_y = saved_y;
651#endif
652            do
653            {
654                // Load atile and interleaved btile.
655                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
656#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
657                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
658                Dtype*  pblockA00 = (Dtype*)(&blockA00);
659#else
660                Dtype_t blockA00;
661                Dtype*  pblockA00 = (Dtype*)(&blockA00);
662                int pos = 0;
663                LOOP(KERNEL_WIDTH, pos,
664                {
665                  if (curr_y >= INPUT_PAD_H &&
666                      curr_y < input_height + INPUT_PAD_H &&
667                      curr_x + pos * DILATION_X >= INPUT_PAD_W &&
668                      curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
669                    pblockA00[pos] = src0_read[pos * DILATION_X];
670                  else
671                    pblockA00[pos] = 0;
672                })
673                curr_y += DILATION_Y;
674#endif
675                src0_read += (ROW_PITCH * DILATION_Y);
676                Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];
677
678                interleaved_y = 0;
679                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
680                {
681#if TILE_N_LAST_DIV8 == 1
682                    Dtype2* p2BlockB = (Dtype2* )blockB;
683                    p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
684#elif TILE_N_LAST_DIV8 == 2
685                    Dtype4* p4BlockB = (Dtype4* )blockB;
686                    p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
687#elif TILE_N_LAST_DIV8 == 3
688                    //TODO: broken.  No block_read6
689                    Dtype6* p6BlockB = (Dtype6* )blockB;
690                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
691                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );
692#endif
693                    src1_read += WIDTH1 * 2;
694                } )
695                if ( kernel_width_is_odd )
696                {
697#if TILE_N_LAST_DIV8 == 1
698                    Dtype* pBlockB = (Dtype* )blockB;
699                    pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );
700#elif TILE_N_LAST_DIV8 == 2
701                    Dtype2* p2BlockB = (Dtype2* )blockB;
702                    p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
703#elif TILE_N_LAST_DIV8 == 3
704                    Dtype3* p3BlockB = (Dtype3* )blockB;
705                    p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
706                    p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 2 * 8) ) );
707#endif
708                    src1_read += WIDTH1 * 2;
709                }
710
711                // Perform MADs
712                Dtype* pBlockB = (Dtype*)blockB;
713                kernel_idx = 0;
714                interleaved_y = 0;
715                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
716                {
717                    kernel_y = interleaved_y * 2;
718                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
719                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
720#if TILE_N_LAST_DIV8 >= 2
721                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
722                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
723#if TILE_N_LAST_DIV8 >= 3
724                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
725                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
726#endif
727#endif
728                } )
729                    kernel_y = interleaved_y * 2;
730                if ( kernel_width_is_odd )
731                {
732                    DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
733#if TILE_N_LAST_DIV8 >= 2
734                    DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
735#if TILE_N_LAST_DIV8 >= 3
736                    DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
737#endif
738#endif
739                }
740            }
741
742            //while( ++patch_row < 1 ); //debug
743            while( ++patch_row < KERNEL_HEIGHT );
744
745            // reset to start of next slice of patch
746            src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
747        }
748        //while ( ++patch_depth < 1 );  //debug
749        while ( ++patch_depth < INPUT_DEPTH );
750
751        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
752        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
753        int out_offset = global_z * out_pitch_z                                        // batch offset
754         + ( group_x * TILE_N ) * out_pitch_y                                          // channel offset
755         + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X  // y offset
756         + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;                // x offset
757        __global Dtype *out = dst + out_offset;
758#if APPLY_BIAS
759        Dtype bias[4];
760        Dtype4 *bias_vec;
761        bias_vec = (Dtype4*)bias;
762        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
763        if (group_x > 0xFFFFFFFEul) {
764          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
765        }
766#else
767        const Dtype bias[4] = {0, 0, 0, 0};
768#endif
769
770        if (global_y * TILE_M < output_width * output_height )
771        {
772            for (int i = 0; i < 8; i++)
773            {
774                if ( TILE_N_LAST_DIV8 > 0 )
775                {
776                  ACTIVATION_FUNCTION(dst, out_offset + ( 0+i) * out_pitch_y, blockC[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
777                }
778                if ( TILE_N_LAST_DIV8 > 1 )
779                {
780                  ACTIVATION_FUNCTION(dst, out_offset + ( 8+i) * out_pitch_y, blockC[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
781                }
782                if ( TILE_N_LAST_DIV8 > 2 )
783                {
784                  ACTIVATION_FUNCTION(dst, out_offset + (16+i) * out_pitch_y, blockC[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
785                }
786                if ( TILE_N_LAST_DIV8 > 3 )
787                {
788                  ACTIVATION_FUNCTION(dst, out_offset + (24+i) * out_pitch_y, blockC[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
789                }
790            }
791        }
792    }
793#endif
794}
795#endif
796#ifdef GEMM_LIKE_CONV_32_2
797
798//////////////////////////////////////////////////////////////////////////////
799// Conv_Interleaved_32_2_flex
800//
801// Convolution: each workitem computes 1 patch x 32 filters worth of output
802// data.  Kernel's inner loop works on a single tile consisting of one
803// row from each patch and the filter data corresponding to that row.  Filter
804// matrix is interleaved to reduce GRF bank conflicts.  Patches are walked
805// by rows and then by slices.  Relies on sub_group extension for block
806// reads and SIMD broadcast.  Allows flexible sizing of TILE width (TILE_N)
807// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
808// the other uses TILE_N = 8, 16, or 24.
809#define TILE_M          2
810#define TILE_K          KERNEL_WIDTH
811#define TILE_N          32
812
813__attribute__((intel_reqd_sub_group_size(8)))
814__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
815{
816    __global Dtype *dst = dst_base + dst_offset;
817    const int group_x = get_group_id(0);
818    const int group_y = get_group_id(1);
819    const int global_x = get_global_id(0);
820    const int global_y = get_global_id(1);
821    const int global_z = get_global_id(2);
822    int interleaved_y;
823    int kernel_y;
824    int kernel_idx;
825
826#define DOT_PRODUCT_8( _result, _rowA, colB )    \
827    {   \
828        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 );  \
829        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 );  \
830        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 );  \
831        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 );  \
832        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 );  \
833        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 );  \
834        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 );  \
835        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 );  \
836    }
837        typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
838
839    // True for all threads if filter_width is multiple of TILE_N
840    // else, true for all but right-most column of threads.
841    if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
842    {
843        // Result ctile (*dst) is M rows x N columns
844        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
845        Dtype8  blockC00 = 0.f;
846        Dtype8  blockC10 = 0.f;
847        Dtype8  blockC20 = 0.f;
848        Dtype8  blockC30 = 0.f;
849        Dtype8  blockC01 = 0.f;
850        Dtype8  blockC11 = 0.f;
851        Dtype8  blockC21 = 0.f;
852        Dtype8  blockC31 = 0.f;
853
854        // Src0 (patch input) is directly used as atile.
855        // Each work item points to the start of a different patch.
856        // atile is M rows x K columns.
857        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
858        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
859        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
860        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
861#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
862        int saved_y0 = curr_y0;
863        int saved_y1 = curr_y1;
864#endif
865        const __global Dtype *src0_read0 = src0
866         + aligned_input_size * global_z         // batch offset
867         + (curr_y0 - INPUT_PAD_H) * ROW_PITCH   // y offset
868         + curr_x0 - INPUT_PAD_W;                // x offset
869        const __global Dtype *src0_read1 = src0
870         + aligned_input_size * global_z         // batch offset
871         + (curr_y1 - INPUT_PAD_H) * ROW_PITCH   // y offset
872         + curr_x1 - INPUT_PAD_W;                // x offset
873
874        // Src1 (filter) is directly used as btile.
875        // It starts at the top of src1 and walks down.
876        // btile is K rows x N columns.
877        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);
878
879        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
880        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
881        // and KERNEL_WIDTH/2 rows of interleaved filter.
882        int patch_depth = 0;
883        do
884        {
885            int patch_row = 0;
886            do
887            {
888                // Load atile and btile.
889                // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype8 granularity.
890                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
891                // interleaved row is padded with zero to ensure same size as interleaved rows. This
892                // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
893                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
894                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
895                // (0, 1) (8, 1) (16, 1) (24, 1) ... =>    (0, 2) (8, 2) (16, 2) (24, 2) ...
896                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
897                // ...
898                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
899#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
900  #if KERNEL_WIDTH == 3
901                Dtype_t blockA00 = vload3(0, src0_read0); src0_read0 += ROW_PITCH;
902                Dtype_t blockA01 = vload3(0, src0_read1); src0_read1 += ROW_PITCH;
903                Dtype*  pblockA00 = (Dtype*)(&blockA00);
904                Dtype*  pblockA01 = (Dtype*)(&blockA01);
905  #else
906                Dtype_t blockA00 = { (Dtype)0.f };
907                Dtype_t blockA01 = { (Dtype)0.f };
908                Dtype*  pblockA00 = (Dtype*)(&blockA00);
909                Dtype*  pblockA01 = (Dtype*)(&blockA01);
910                int pos = 0;
911                LOOP(KERNEL_WIDTH, pos,
912                {
913                  if (curr_x0 + pos < input_width)
914                    pblockA00[pos] = src0_read0[pos];
915
916                  if (curr_x1 + pos < input_width)
917                    pblockA01[pos] = src0_read1[pos];
918                })
919                src0_read0 += ROW_PITCH;
920                src0_read1 += ROW_PITCH;
921  #endif
922#else
923                Dtype_t blockA00;
924                Dtype*  pblockA00 = (Dtype*)(&blockA00);
925                int pos = 0;
926                LOOP(KERNEL_WIDTH, pos,
927                {
928                  if (curr_y0 >= INPUT_PAD_H &&
929                      curr_y0 < input_height + INPUT_PAD_H &&
930                      curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
931                      curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
932                    pblockA00[pos] = src0_read0[pos * DILATION_X];
933                  else
934                    pblockA00[pos] = 0;
935                })
936                curr_y0 += DILATION_Y;
937                Dtype_t blockA01;
938                Dtype*  pblockA01 = (Dtype*)(&blockA01);
939                pos = 0;
940                LOOP(KERNEL_WIDTH, pos,
941                {
942                  if (curr_y1 >= INPUT_PAD_H &&
943                      curr_y1 < input_height + INPUT_PAD_H &&
944                      curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
945                      curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
946                    pblockA01[pos] = src0_read1[pos * DILATION_X];
947                  else
948                    pblockA01[pos] = 0;
949                })
950                curr_y1 += DILATION_Y;
951                src0_read0 += (ROW_PITCH * DILATION_Y);
952                src0_read1 += (ROW_PITCH * DILATION_Y);
953#endif
954                Dtype blockB00[KERNEL_WIDTH*4];
955                Dtype8* p8BlockB00 = (Dtype8*)blockB00;
956                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
957                Dtype*  pBlockB00 =  (Dtype* )blockB00;
958
959                interleaved_y = 0;
960                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
961                {
962                    p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE*)src1_read ) );
963                    src1_read += WIDTH1 * 2;
964                } )
965                if ( kernel_width_is_odd )
966                {
967                    p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
968                    src1_read += WIDTH1 * 2;
969                }
970                // Perform MADs
971                kernel_idx = 0;
972                interleaved_y = 0;
973                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
974                {
975                    kernel_y = interleaved_y * 2;
976                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
977                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
978                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
979                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
980                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
981                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
982                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
983                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
984                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
985                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
986                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
987                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
988                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
989                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
990                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
991                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
992                } )
993                if ( kernel_width_is_odd )
994                {
995                    kernel_y = interleaved_y * 2;
996                    DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
997                    DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
998                    DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
999                    DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1000                    DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] );
1001                    DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1002                    DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] );
1003                    DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1004                }
1005            }
1006
1007            //while( ++patch_row < 1 ); //debug
1008            while( ++patch_row < KERNEL_HEIGHT );
1009#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1010            curr_y0 = saved_y0;
1011            curr_y1 = saved_y1;
1012#endif
1013            // reset to start of next slice of patch
1014            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
1015            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
1016        }
1017        //while ( ++patch_depth < 1 );  //debug
1018        while ( ++patch_depth < INPUT_DEPTH );
1019
1020        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
1021        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
1022        int out0_offset = global_z * out_pitch_z                                           // batch offset
1023         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
1024         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
1025         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
1026        int out1_offset = global_z * out_pitch_z                                           // batch offset
1027         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
1028         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
1029         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
1030
1031#if APPLY_BIAS
1032        Dtype bias[4];
1033        Dtype4 *bias_vec;
1034        bias_vec = (Dtype4*)bias;
1035        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
1036        if (group_x > 0xFFFFFFFEul) {
1037          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
1038        }
1039#else
1040        const Dtype bias[4] = {0, 0, 0, 0};
1041#endif
1042
1043        if( global_y * TILE_M < output_width * output_height )
1044        {
1045            for( int i = 0; i < 8; i++ )
1046            {
1047                ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
1048                ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
1049                ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
1050                ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
1051            }
1052        }
1053        if( global_y * TILE_M + 1 < output_width * output_height )
1054        {
1055            for( int i = 0; i < 8; i++ )
1056            {
1057                ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC01[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
1058                ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC11[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
1059                ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC21[i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
1060                ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC31[i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
1061            }
1062        }
1063    }
1064#if TILE_N_LAST > 0
1065    else
1066    {
1067
1068        // Result ctile (*dst) is M rows x N columns
1069        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
1070        int i = 0;
1071        Dtype8  blockC0[TILE_N_LAST_DIV8];
1072        Dtype8  blockC1[TILE_N_LAST_DIV8];
1073        LOOP(TILE_N_LAST_DIV8, i,
1074        {
1075            blockC0[i] = 0.f;
1076            blockC1[i] = 0.f;
1077        } )
1078
1079        // Src0 (patch input) is directly used as atile.
1080        // Each work item points to the start of a different patch.
1081        // atile is M rows x K columns.
1082        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
1083        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
1084        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
1085        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
1086#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1087        int saved_y0 = curr_y0;
1088        int saved_y1 = curr_y1;
1089#endif
1090        const __global Dtype *src0_read0 = src0
1091         + aligned_input_size * global_z         // batch offset
1092         + (curr_y0 - INPUT_PAD_H) * ROW_PITCH   // y offset
1093         + curr_x0 - INPUT_PAD_W;                // x offset
1094        const __global Dtype *src0_read1 = src0
1095         + aligned_input_size * global_z         // batch offset
1096         + (curr_y1 - INPUT_PAD_H) * ROW_PITCH   // y offset
1097         + curr_x1 - INPUT_PAD_W;                // x offset
1098
1099        // Src1 (filter) is directly used as btile.
1100        // It starts at the top of src1 and walks down.
1101        // btile is K rows x N columns.
1102        const __global Dtype *src1_read = src1 + ( global_x * TILE_N  * 2);
1103
1104        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
1105        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
1106        // and KERNEL_WIDTH/2 rows of interleaved filter.
1107        int patch_depth = 0;
1108        do
1109        {
1110            int patch_row = 0;
1111            do
1112            {
1113                // Load atile and interleaved btile.
1114                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
1115#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
1116                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[  0  ]; src0_read0 += ROW_PITCH;
1117                Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[  0  ]; src0_read1 += ROW_PITCH;
1118                Dtype*  pblockA00 = (Dtype*)(&blockA00);
1119                Dtype*  pblockA01 = (Dtype*)(&blockA01);
1120#else
1121                Dtype_t blockA00;
1122                Dtype*  pblockA00 = (Dtype*)(&blockA00);
1123                int pos = 0;
1124                LOOP(KERNEL_WIDTH, pos,
1125                {
1126                  if (curr_y0 >= INPUT_PAD_H &&
1127                      curr_y0 < input_height + INPUT_PAD_H &&
1128                      curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
1129                      curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
1130                    pblockA00[pos] = src0_read0[pos * DILATION_X];
1131                  else
1132                    pblockA00[pos] = 0;
1133                })
1134                curr_y0 += DILATION_Y;
1135                Dtype_t blockA01;
1136                Dtype*  pblockA01 = (Dtype*)(&blockA01);
1137                pos = 0;
1138                LOOP(KERNEL_WIDTH, pos,
1139                {
1140                  if (curr_y1 >= INPUT_PAD_H &&
1141                      curr_y1 < input_height + INPUT_PAD_H &&
1142                      curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
1143                      curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
1144                    pblockA01[pos] = src0_read1[pos * DILATION_X];
1145                  else
1146                    pblockA01[pos] = 0;
1147                })
1148                curr_y1 += DILATION_Y;
1149                src0_read0 += (ROW_PITCH * DILATION_Y);
1150                src0_read1 += (ROW_PITCH * DILATION_Y);
1151#endif
1152                Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];
1153
1154                interleaved_y = 0;
1155                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1156                {
1157#if TILE_N_LAST_DIV8 == 1
1158                    Dtype2* p2BlockB = (Dtype2* )blockB;
1159                    p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
1160#elif TILE_N_LAST_DIV8 == 2
1161                    Dtype4* p4BlockB = (Dtype4* )blockB;
1162                    p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
1163#elif TILE_N_LAST_DIV8 == 3
1164                    //TODO: broken.  No block_read6
1165                    Dtype6* p6BlockB = (Dtype6* )blockB;
1166                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
1167                    (*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );
1168#endif
1169                    src1_read += WIDTH1 * 2;
1170                } )
1171                if ( kernel_width_is_odd )
1172                {
1173#if TILE_N_LAST_DIV8 == 1
1174                    Dtype* pBlockB = (Dtype* )blockB;
1175                    pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );
1176#elif TILE_N_LAST_DIV8 == 2
1177                    Dtype2* p2BlockB = (Dtype2* )blockB;
1178                    p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
1179#elif TILE_N_LAST_DIV8 == 3
1180                    Dtype3* p3BlockB = (Dtype3* )blockB;
1181                    p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
1182                    p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 8) ) );
1183#endif
1184                    src1_read += WIDTH1 * 2;
1185                }
1186
1187                // Perform MADs
1188                Dtype* pBlockB = (Dtype*)blockB;
1189                kernel_idx = 0;
1190                interleaved_y = 0;
1191                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1192                {
1193                    kernel_y = interleaved_y * 2;
1194                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
1195                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
1196                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
1197                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
1198#if TILE_N_LAST_DIV8 >= 2
1199                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
1200                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
1201                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
1202                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
1203#if TILE_N_LAST_DIV8 >= 3
1204                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y    ], pBlockB[kernel_idx] );
1205                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y    ], pBlockB[kernel_idx] ); kernel_idx++;
1206                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
1207                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
1208#endif
1209#endif
1210                } )
1211                    kernel_y = interleaved_y * 2;
1212                if ( kernel_width_is_odd )
1213                {
1214                    DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y], pBlockB[kernel_idx] );
1215                    DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
1216#if TILE_N_LAST_DIV8 >= 2
1217                    DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y], pBlockB[kernel_idx] );
1218                    DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
1219#if TILE_N_LAST_DIV8 >= 3
1220                    DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y], pBlockB[kernel_idx] );
1221                    DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
1222#endif
1223#endif
1224                }
1225            }
1226
1227            //while( ++patch_row < 1 ); //debug
1228            while( ++patch_row < KERNEL_HEIGHT );
1229#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1230            curr_y0 = saved_y0;
1231            curr_y1 = saved_y1;
1232#endif
1233            // reset to start of next slice of patch
1234            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
1235            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
1236        }
1237        //while ( ++patch_depth < 1 );  //debug
1238        while ( ++patch_depth < INPUT_DEPTH );
1239
1240        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
1241        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
1242        int out0_offset = global_z * out_pitch_z                                           // batch offset
1243         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
1244         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
1245         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
1246        int out1_offset = global_z * out_pitch_z                                           // batch offset
1247         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
1248         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
1249         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
1250        __global Dtype *out1 = dst + out1_offset;
1251
1252#if APPLY_BIAS
1253        Dtype bias[4];
1254        Dtype4 *bias_vec;
1255        bias_vec = (Dtype4*)bias;
1256        *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
1257        if (group_x > 0xFFFFFFFEul) {
1258          dst[0] = bias[0] + bias[1] + bias[2] + bias[3];
1259        }
1260#else
1261        const Dtype bias[4] = {0, 0, 0, 0};
1262#endif
1263        if( global_y * TILE_M < output_width * output_height )
1264        {
1265            for( int i = 0; i < 8; i++ )
1266            {
1267                if ( TILE_N_LAST_DIV8 > 0 )
1268                {
1269                  ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC0[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
1270                }
1271                if ( TILE_N_LAST_DIV8 > 1 )
1272                {
1273                  ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC0[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
1274                }
1275                if ( TILE_N_LAST_DIV8 > 2 )
1276                {
1277                  ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC0[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
1278                }
1279                if ( TILE_N_LAST_DIV8 > 3 )
1280                {
1281                  ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC0[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
1282                }
1283            }
1284        }
1285        if( global_y * TILE_M + 1 < output_width * output_height )
1286        {
1287            for( int i = 0; i < 8; i++ )
1288            {
1289                if ( TILE_N_LAST_DIV8 > 0 )
1290                {
1291                  ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC1[0][i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i);
1292                }
1293                if ( TILE_N_LAST_DIV8 > 1 )
1294                {
1295                  ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC1[1][i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 8);
1296                }
1297                if ( TILE_N_LAST_DIV8 > 2 )
1298                {
1299                  ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC1[2][i] + SUBGROUP_GET_BIAS(2, i), group_x * TILE_N + i + 16);
1300                }
1301                if ( TILE_N_LAST_DIV8 > 3 )
1302                {
1303                  ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC1[3][i] + SUBGROUP_GET_BIAS(3, i), group_x * TILE_N + i + 24);
1304                }
1305            }
1306        }
1307    }
1308#endif
1309}
1310#endif
1311
1312#if defined(GEMM_LIKE_CONV_32_2_SIMD16) || defined(GEMM_LIKE_CONV_32_1_SIMD16)
1313#define INTERLEAVED_SIMD16_OUTPUT(_out_, _offset_,  _m_) do {\
1314    if (global_y * TILE_M < output_width * output_height ) \
1315    { \
1316      if ( ( OUT_DEPTH % TILE_N ) == 0 ) {\
1317        for (int i = 0; i < 16; i++) \
1318        { \
1319          ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1320          ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
1321        } \
1322      } \
1323      else if( ( OUT_DEPTH % 16 ) == 0 ) { \
1324        if ( ( global_x + 1 ) < get_global_size(0) ) { \
1325          for ( int i = 0; i < 16; i++ ) \
1326          { \
1327            ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1328            ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
1329          } \
1330        } \
1331        else { \
1332          for (int i = 0; i < 16; i++) \
1333          { \
1334            ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1335          } \
1336        } \
1337      } \
1338      else { \
1339        if ( ( global_x + 1 ) < get_global_size(0) ) \
1340        { \
1341          for ( int i = 0; i < 16; i++ ) \
1342          { \
1343            ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1344            ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
1345          } \
1346        } \
1347        else { \
1348          if ( (OUT_DEPTH % TILE_N) > 16 ) { \
1349            for (int i = 0; i < 16 ; i++) \
1350            { \
1351              ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1352            } \
1353            for (int i = 0; i < OUT_DEPTH % 16 ; i++) \
1354            { \
1355              ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i), group_x * TILE_N + i + 16); \
1356            } \
1357          } \
1358          else { \
1359            for (int i = 0; i < OUT_DEPTH % 16 ; i++) \
1360            { \
1361              ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i), group_x * TILE_N + i); \
1362            } \
1363          } \
1364        } \
1365      } \
1366    } \
1367 }while(0)
1368#endif
1369
1370#ifdef GEMM_LIKE_CONV_32_1_SIMD16
1371#define TILE_M          1
1372#define TILE_K          KERNEL_WIDTH
1373#define TILE_N          32
1374
1375__attribute__((intel_reqd_sub_group_size(16)))
1376__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1377{
1378    __global Dtype *dst = dst_base + dst_offset;
1379    const int group_x = get_group_id(0);
1380    const int group_y = get_group_id(1);
1381    const int global_x = get_global_id(0);
1382    const int global_y = get_global_id(1);
1383    const int global_z = get_global_id(2);
1384    int interleaved_y;
1385    int kernel_y;
1386    int kernel_idx;
1387
1388    // Result ctile (*dst) is M rows x N columns
1389    // LWG size is 1x16.  Thus each thread calculates 16*M rows x N cols of ctile.
1390    Dtype16  blockC00 = 0.f;
1391    Dtype16  blockC10 = 0.f;
1392
1393    // Src0 (patch input) is directly used as atile.
1394    // Each work item points to the start of a different patch.
1395    // atile is M rows x K columns.
1396    int curr_x = ( global_y % output_width ) * STRIDE_X;
1397    int curr_y = ( global_y / output_width ) * STRIDE_Y;
1398#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1399    int saved_y = curr_y;
1400#endif
1401    const __global Dtype *src0_read = src0
1402     + aligned_input_size * global_z           // batch offset
1403     + (curr_y - INPUT_PAD_H) * ROW_PITCH      // y offset
1404     + curr_x - INPUT_PAD_W;                   // x offset
1405     const __global Dtype *src0_read_orig = src0_read;
1406
1407    // Src1 (filter) is directly used as btile.
1408    // It starts at the top of src1 and walks down.
1409    // btile is K rows x N columns.
1410    const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2 );
1411
1412#define DOT_PRODUCT_16( _result, _rowA, colB )    \
1413    {   \
1414        _result.s0 = mad( _rowA, sub_group_broadcast( colB,  0 ), _result.s0 );  \
1415        _result.s1 = mad( _rowA, sub_group_broadcast( colB,  1 ), _result.s1 );  \
1416        _result.s2 = mad( _rowA, sub_group_broadcast( colB,  2 ), _result.s2 );  \
1417        _result.s3 = mad( _rowA, sub_group_broadcast( colB,  3 ), _result.s3 );  \
1418        _result.s4 = mad( _rowA, sub_group_broadcast( colB,  4 ), _result.s4 );  \
1419        _result.s5 = mad( _rowA, sub_group_broadcast( colB,  5 ), _result.s5 );  \
1420        _result.s6 = mad( _rowA, sub_group_broadcast( colB,  6 ), _result.s6 );  \
1421        _result.s7 = mad( _rowA, sub_group_broadcast( colB,  7 ), _result.s7 );  \
1422        _result.s8 = mad( _rowA, sub_group_broadcast( colB,  8 ), _result.s8 );  \
1423        _result.s9 = mad( _rowA, sub_group_broadcast( colB,  9 ), _result.s9 );  \
1424        _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa );  \
1425        _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb );  \
1426        _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc );  \
1427        _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd );  \
1428        _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se );  \
1429        _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf );  \
1430    }
1431    typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
1432    // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
1433    // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
1434    // and KERNEL_WIDTH/2 rows of interleaved filter.
1435    int patch_depth = 0;
1436    __attribute__((opencl_unroll_hint(1)))
1437    do
1438    {
1439        int patch_row = 0;
1440#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1441        curr_y = saved_y;
1442#endif
1443        __attribute__((opencl_unroll_hint(1)))
1444        do
1445        {
1446            // Load atile and btile.
1447            // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype16 granularity.
1448            // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
1449            // interleaved row is padded with zero to ensure same size as interleaved rows. This
1450            // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
1451            // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
1452            // (0, 0) (16, 0) (32, 0) (48, 0) ...     (0, 0) ( 0, 1) (16, 0) ( 0, 1) (32, 0) (0, 1) (48, 0) ...
1453            // (0, 1) (16, 1) (32, 1) (48, 1) ... =>  (0, 2) (16, 2) (32, 2) (48, 2) ...
1454            // (0, 2) (16, 2) (32, 2) (48, 2) ...     ...
1455            // ...
1456            const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
1457
1458#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
1459  #if KERNEL_WIDTH == 3
1460            Dtype_t blockA00 = vload3(0, src0_read);
1461            Dtype*  pblockA00 = (Dtype*)(&blockA00);
1462  #else
1463            Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[  0  ];
1464            Dtype*  pblockA00 = (Dtype*)(&blockA00);
1465  #endif
1466#else
1467            Dtype_t blockA00;
1468            Dtype*  pblockA00 = (Dtype*)(&blockA00);
1469            int pos = 0;
1470            LOOP(KERNEL_WIDTH, pos,
1471            {
1472              if (curr_y >= INPUT_PAD_H &&
1473                  curr_y < input_height + INPUT_PAD_H &&
1474                  curr_x + pos * DILATION_X >= INPUT_PAD_W &&
1475                  curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
1476                pblockA00[pos] = src0_read[pos * DILATION_X];
1477              else
1478                pblockA00[pos] = 0;
1479            })
1480            curr_y += DILATION_Y;
1481#endif
1482            src0_read += ROW_PITCH * DILATION_Y;
1483            INT_TYPE blockB00[KERNEL_WIDTH * 2];
1484            INT_TYPE4* p4BlockB00 = (INT_TYPE4*)blockB00;
1485            INT_TYPE2* p2BlockB00 = (INT_TYPE2*)blockB00;
1486            Dtype* pBlockB00  = (Dtype*)blockB00;
1487            interleaved_y = 0;
1488            LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1489            {
1490                p4BlockB00[interleaved_y] = SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read );
1491                src1_read += WIDTH1 * 2;
1492            } )
1493            if ( kernel_width_is_odd )
1494            {
1495                p2BlockB00[KERNEL_WIDTH - 1] = SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read );
1496                src1_read += WIDTH1 * 2;
1497            }
1498
1499            // Perform MADs
1500            kernel_idx = 0;
1501            interleaved_y = 0;
1502            LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1503            {
1504                kernel_y = interleaved_y * 2;
1505                DOT_PRODUCT_16( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
1506                DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
1507                DOT_PRODUCT_16( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
1508                DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
1509            } )
1510            if ( kernel_width_is_odd )
1511            {
1512                kernel_y = interleaved_y * 2;
1513                DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1514                DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1515            }
1516        }
1517
1518        //while( ++patch_row < 1 ); //debug
1519        while( ++patch_row < KERNEL_HEIGHT );
1520
1521        // reset to start of next slice of patch
1522        src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
1523    }
1524    //while ( ++patch_depth < 1 );  //debug
1525    while ( ++patch_depth < INPUT_DEPTH );
1526
1527    // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
1528    // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
1529    int out_offset = global_z * out_pitch_z                                        // batch offset
1530     + ( group_x * TILE_N ) * out_pitch_y                                          // channel offset
1531     + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X  // y offset
1532     + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT;                // x offset
1533    __global Dtype *out = dst + out_offset;
1534
1535#if APPLY_BIAS
1536    Dtype bias[2];
1537    Dtype2 *bias_vec;
1538    bias_vec = (Dtype2*)bias;
1539    *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
1540    if (group_x > 0xFFFFFFFEul) {
1541      dst[0] = bias[0] + bias[1];
1542    }
1543#else
1544    const Dtype bias[2] = {0, 0};
1545#endif
1546    INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
1547}
1548#endif
1549
1550#ifdef GEMM_LIKE_CONV_32_2_SIMD16
1551
1552//////////////////////////////////////////////////////////////////////////////
1553// Conv_Interleaved_32_2_SIMD16
1554//
1555// Convolution: each workitem computes 1 patch x 32 filters worth of output
1556// data.
1557#define TILE_M          2
1558#define TILE_K          KERNEL_WIDTH
1559#define TILE_N          32
1560
1561__attribute__((intel_reqd_sub_group_size(16)))
1562__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
1563{
1564    __global Dtype *dst = dst_base + dst_offset;
1565    const int group_x = get_group_id(0);
1566    const int group_y = get_group_id(1);
1567    const int global_x = get_global_id(0);
1568    const int global_y = get_global_id(1);
1569    const int global_z = get_global_id(2);
1570    int interleaved_y;
1571    int kernel_y;
1572    int kernel_idx;
1573#define DOT_PRODUCT_16( _result, _rowA, colB )    \
1574    {   \
1575        _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 );  \
1576        _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 );  \
1577        _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 );  \
1578        _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 );  \
1579        _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 );  \
1580        _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 );  \
1581        _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 );  \
1582        _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 );  \
1583        _result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 );  \
1584        _result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 );  \
1585        _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa );  \
1586        _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb );  \
1587        _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc );  \
1588        _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd );  \
1589        _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se );  \
1590        _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf );  \
1591    }
1592        typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
1593
1594    // True for all threads if filter_width is multiple of TILE_N
1595    // else, true for all but right-most column of threads.
1596    {
1597        // Result ctile (*dst) is M rows x N columns
1598        // LWG size is 1x8.  Thus each thread calculates 8*M rows x N cols of ctile.
1599        Dtype16  blockC00 = 0.f;
1600        Dtype16  blockC10 = 0.f;
1601        Dtype16  blockC01 = 0.f;
1602        Dtype16  blockC11 = 0.f;
1603
1604        // Src0 (patch input) is directly used as atile.
1605        // Each work item points to the start of a different patch.
1606        // atile is M rows x K columns.
1607        int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
1608        int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
1609        int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
1610        int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
1611#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1612        int saved_y0 = curr_y0;
1613        int saved_y1 = curr_y1;
1614#endif
1615        const __global Dtype *src0_read0 = src0
1616         + aligned_input_size * global_z         // batch offset
1617         + (curr_y0 - INPUT_PAD_H) * ROW_PITCH   // y offset
1618         + curr_x0 - INPUT_PAD_W;                // x offset
1619        const __global Dtype *src0_read1 = src0
1620         + aligned_input_size * global_z         // batch offset
1621         + (curr_y1 - INPUT_PAD_H) * ROW_PITCH   // y offset
1622         + curr_x1 - INPUT_PAD_W;                // x offset
1623
1624        // Src1 (filter) is directly used as btile.
1625        // It starts at the top of src1 and walks down.
1626        // btile is K rows x N columns.
1627        const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);
1628
1629        // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
1630        // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
1631        // and KERNEL_WIDTH/2 rows of interleaved filter.
1632        int patch_depth = 0;
1633        do
1634        {
1635            int patch_row = 0;
1636            do
1637            {
1638                // Load atile and btile.
1639                // Kernel data is partially interleaved.  Every 2 rows are interleaved at Dtype8 granularity.
1640                // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
1641                // interleaved row is padded with zero to ensure same size as interleaved rows. This
1642                // interleaving is done to ensure 0% GDR bank conflicts.  For example, this is how the
1643                // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
1644                // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
1645                // (0, 1) (8, 1) (16, 1) (24, 1) ... =>    (0, 2) (8, 2) (16, 2) (24, 2) ...
1646                // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
1647                // ...
1648                const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
1649#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1 && INPUT_PAD_BOTTOM == 0 && INPUT_PAD_RIGHT == 0
1650                Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[  0  ]; src0_read0 += ROW_PITCH;
1651                Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[  0  ]; src0_read1 += ROW_PITCH;
1652                Dtype*  pblockA00 = (Dtype*)(&blockA00);
1653                Dtype*  pblockA01 = (Dtype*)(&blockA01);
1654#else
1655                Dtype_t blockA00;
1656                Dtype*  pblockA00 = (Dtype*)(&blockA00);
1657                int pos = 0;
1658                LOOP(KERNEL_WIDTH, pos,
1659                {
1660                  if (curr_y0 >= INPUT_PAD_H &&
1661                      curr_y0 < input_height + INPUT_PAD_H &&
1662                      curr_x0 + pos * DILATION_X >= INPUT_PAD_W &&
1663                      curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
1664                    pblockA00[pos] = src0_read0[pos * DILATION_X];
1665                  else
1666                    pblockA00[pos] = 0;
1667                })
1668                curr_y0 += DILATION_Y;
1669                Dtype_t blockA01;
1670                Dtype*  pblockA01 = (Dtype*)(&blockA01);
1671                pos = 0;
1672                LOOP(KERNEL_WIDTH, pos,
1673                {
1674                  if (curr_y1 >= INPUT_PAD_H &&
1675                      curr_y1 < input_height + INPUT_PAD_H &&
1676                      curr_x1 + pos * DILATION_X >= INPUT_PAD_W &&
1677                      curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
1678                    pblockA01[pos] = src0_read1[pos * DILATION_X];
1679                  else
1680                    pblockA01[pos] = 0;
1681                })
1682                curr_y1 += DILATION_Y;
1683                src0_read0 += (ROW_PITCH * DILATION_Y);
1684                src0_read1 += (ROW_PITCH * DILATION_Y);
1685#endif
1686                Dtype blockB00[KERNEL_WIDTH*2];
1687                Dtype4* p4BlockB00 = (Dtype4*)blockB00;
1688                Dtype2* p2BlockB00 = (Dtype2*)blockB00;
1689                Dtype*  pBlockB00 =  (Dtype* )blockB00;
1690
1691                interleaved_y = 0;
1692                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1693                {
1694                    p4BlockB00[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
1695                    src1_read += WIDTH1 * 2;
1696                } )
1697                if ( kernel_width_is_odd )
1698                {
1699                    p2BlockB00[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
1700                    src1_read += WIDTH1 * 2;
1701                }
1702                // Perform MADs
1703                kernel_idx = 0;
1704                interleaved_y = 0;
1705                LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
1706                {
1707                    kernel_y = interleaved_y * 2;
1708                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
1709                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
1710                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
1711                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
1712                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y    ], pBlockB00[kernel_idx] );
1713                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y    ], pBlockB00[kernel_idx] ); kernel_idx++;
1714                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
1715                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
1716                } )
1717                if ( kernel_width_is_odd )
1718                {
1719                    kernel_y = interleaved_y * 2;
1720                    DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
1721                    DOT_PRODUCT_16( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1722                    DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
1723                    DOT_PRODUCT_16( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
1724                }
1725            }
1726
1727            //while( ++patch_row < 1 ); //debug
1728            while( ++patch_row < KERNEL_HEIGHT );
1729#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1 || INPUT_PAD_BOTTOM != 0 || INPUT_PAD_RIGHT != 0
1730            curr_y0 = saved_y0;
1731            curr_y1 = saved_y1;
1732#endif
1733            // reset to start of next slice of patch
1734            src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
1735            src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y);
1736        }
1737        //while ( ++patch_depth < 1 );  //debug
1738        while ( ++patch_depth < INPUT_DEPTH );
1739
1740        // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
1741        // (SIMD * TILE_M) x 1 x TILE_N.  Partial writes most likely generated if padding used.
1742        int out0_offset = global_z * out_pitch_z                                           // batch offset
1743         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
1744         + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
1745         + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
1746        int out1_offset = global_z * out_pitch_z                                           // batch offset
1747         + ( group_x * TILE_N ) * out_pitch_y                                              // channel offset
1748         + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
1749         + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT;                // x offset
1750
1751#if APPLY_BIAS
1752        Dtype bias[2];
1753        Dtype2 *bias_vec;
1754        bias_vec = (Dtype2*)bias;
1755        *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
1756        if (group_x > 0xFFFFFFFEul) {
1757          dst[0] = bias[0] + bias[1];
1758        }
1759#else
1760        const Dtype bias[2] = {0, 0};
1761#endif
1762        INTERLEAVED_SIMD16_OUTPUT(dst, out0_offset, 0);
1763        INTERLEAVED_SIMD16_OUTPUT(dst, out1_offset, 1);
1764    }
1765}
1766#endif
1767
1768#elif defined KERNEL_DWCONV
1769
1770__kernel void DWCONV(
1771    ELTWISE_DATA_ARG
1772    FUSED_ARG
1773    __global Dtype* image_data,
1774    __global Dtype* kernel_data,
1775    BIAS_KERNEL_ARG
1776    __global Dtype* convolved_image_base,
1777    const int convolved_image_offset,
1778    const ushort input_width,
1779    const ushort input_height,
1780    const ushort output_width,
1781    const ushort output_height) {
1782  __global Dtype* convolved_image = convolved_image_base + convolved_image_offset;
1783  const int outputX = get_global_id(0);
1784  const int outputY = get_global_id(1);
1785  const int outputZ = get_global_id(2);
1786  if(outputX < output_width && outputY < output_height)
1787  {
1788    Dtype sum = 0.;
1789
1790    const int org_y = outputY * STRIDE_Y - INPUT_PAD_H;
1791    const int org_x = outputX * STRIDE_X - INPUT_PAD_W;
1792    const int currentKernelOffset = KERNEL_SIZE*(outputZ%CHANNELS);
1793    const int biasIndex=outputZ%CHANNELS;
1794    const int local_image_offset = org_y*input_width + org_x;
1795    const int imageSize = input_width*input_height;
1796
1797    __global Dtype* image_dataPtrFloat = (image_data + (imageSize*outputZ + local_image_offset));
1798    __global Dtype* kernel_dataPtrFloat = (kernel_data + (currentKernelOffset));
1799
1800    for(int y = 0; y < KERNEL_H; y++)
1801    {
1802      for(int x = 0; x < KERNEL_W; x++)
1803      {
1804        if(!(org_y + y * DILATION_Y >= 0 && org_y + y * DILATION_Y < input_height && org_x + x * DILATION_X >= 0 && org_x + x * DILATION_X < input_width))
1805        {
1806          continue;
1807        }
1808        sum += image_dataPtrFloat[x * DILATION_X] * kernel_dataPtrFloat[x];
1809      }
1810      image_dataPtrFloat += input_width * DILATION_Y;
1811      kernel_dataPtrFloat += KERNEL_W;
1812    }
1813
1814    #if APPLY_BIAS
1815    int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
1816    ACTIVATION_FUNCTION(convolved_image, offset, sum + biases_base[biasIndex], biasIndex);
1817    #else
1818    int offset = outputZ*output_height*output_width + outputY*output_width + outputX;
1819    ACTIVATION_FUNCTION(convolved_image, offset, sum, biasIndex);
1820    #endif
1821  }
1822}
1823#endif // KERNEL_BASIC/IDLF/GEMM_LIKE/DWCONV
1824