1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5//  By downloading, copying, installing or using the software you agree to this license.
6//  If you do not agree to this license, do not download, install,
7//  copy or use the software.
8//
9//
10//                           License Agreement
11//                For Open Source Computer Vision Library
12//
13// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15// Third party copyrights are property of their respective owners.
16//
17// @Authors
18//    Dachuan Zhao, dachuan@multicorewareinc.com
19//
20// Redistribution and use in source and binary forms, with or without modification,
21// are permitted provided that the following conditions are met:
22//
23//   * Redistribution's of source code must retain the above copyright notice,
24//     this list of conditions and the following disclaimer.
25//
26//   * Redistribution's in binary form must reproduce the above copyright notice,
27//     this list of conditions and the following disclaimer in the documentation
28//     and/or other materials provided with the distribution.
29//
30//   * The name of the copyright holders may not be used to endorse or promote products
31//     derived from this software without specific prior written permission.
32//
33// This software is provided by the copyright holders and contributors as is and
34// any express or implied warranties, including, but not limited to, the implied
35// warranties of merchantability and fitness for a particular purpose are disclaimed.
36// In no event shall the Intel Corporation or contributors be liable for any direct,
37// indirect, incidental, special, exemplary, or consequential damages
38// (including, but not limited to, procurement of substitute goods or services;
39// loss of use, data, or profits; or business interruption) however caused
40// and on any theory of liability, whether in contract, strict liability,
41// or tort (including negligence or otherwise) arising in any way out of
42// the use of this software, even if advised of the possibility of such damage.
43//
44//M*/
45
46#ifdef DOUBLE_SUPPORT
47#ifdef cl_amd_fp64
48#pragma OPENCL EXTENSION cl_amd_fp64:enable
49#elif defined (cl_khr_fp64)
50#pragma OPENCL EXTENSION cl_khr_fp64:enable
51#endif
52#endif
53
54#if defined BORDER_REPLICATE
55// aaaaaa|abcdefgh|hhhhhhh
56#define EXTRAPOLATE(x, maxV) clamp((x), 0, (maxV)-1)
57#elif defined BORDER_WRAP
58// cdefgh|abcdefgh|abcdefg
59#define EXTRAPOLATE(x, maxV) ( (x) + (maxV) ) % (maxV)
60#elif defined BORDER_REFLECT
61// fedcba|abcdefgh|hgfedcb
62#define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ), 0, (maxV)-1)
63#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101
64// gfedcb|abcdefgh|gfedcba
65#define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x), max((x),-(x)) ), 0, (maxV)-1)
66#else
67#error No extrapolation method
68#endif
69
70#if cn != 3
71#define loadpix(addr)  *(__global const T*)(addr)
72#define storepix(val, addr)  *(__global T*)(addr) = (val)
73#define PIXSIZE ((int)sizeof(T))
74#else
75#define loadpix(addr)  vload3(0, (__global const T1*)(addr))
76#define storepix(val, addr) vstore3((val), 0, (__global T1*)(addr))
77#define PIXSIZE ((int)sizeof(T1)*3)
78#endif
79
80#define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x)))
81
82#if kercn == 4
83#define SRC4(_x,_y) convert_float4(vload4(0, srcData + mad24(_y, src_step, PIXSIZE * _x)))
84#endif
85
86#ifdef INTEL_DEVICE
87#define MAD(x,y,z) fma((x),(y),(z))
88#else
89#define MAD(x,y,z) mad((x),(y),(z))
90#endif
91
92#define LOAD_LOCAL(col_gl, col_lcl) \
93    sum0 =     co3* SRC(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));         \
94    sum0 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum0);  \
95    temp = SRC(col_gl, EXTRAPOLATE_(src_y, src_rows));                      \
96    sum0 = MAD(co1, temp, sum0);                                            \
97    sum1 = co3 * temp;                                                      \
98    temp = SRC(col_gl, EXTRAPOLATE_(src_y + 1, src_rows));                  \
99    sum0 = MAD(co2, temp, sum0);                                            \
100    sum1 = MAD(co2, temp, sum1);                                            \
101    temp = SRC(col_gl, EXTRAPOLATE_(src_y + 2, src_rows));                  \
102    sum0 = MAD(co3, temp, sum0);                                            \
103    sum1 = MAD(co1, temp, sum1);                                            \
104    smem[0][col_lcl] = sum0;                                                \
105    sum1 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum1);  \
106    sum1 = MAD(co3, SRC(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum1);  \
107    smem[1][col_lcl] = sum1;
108
109
110#if kercn == 4
111#define LOAD_LOCAL4(col_gl, col_lcl) \
112    sum40 =     co3* SRC4(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));           \
113    sum40 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum40);   \
114    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y, src_rows));                       \
115    sum40 = MAD(co1, temp4, sum40);                                             \
116    sum41 = co3 * temp4;                                                        \
117    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y + 1, src_rows));                   \
118    sum40 = MAD(co2, temp4, sum40);                                             \
119    sum41 = MAD(co2, temp4, sum41);                                             \
120    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y + 2, src_rows));                   \
121    sum40 = MAD(co3, temp4, sum40);                                             \
122    sum41 = MAD(co1, temp4, sum41);                                             \
123    vstore4(sum40, col_lcl, (__local float*) &smem[0][2]);                      \
124    sum41 = MAD(co2, SRC4(col_gl,  EXTRAPOLATE_(src_y + 3, src_rows)), sum41);  \
125    sum41 = MAD(co3, SRC4(col_gl,  EXTRAPOLATE_(src_y + 4, src_rows)), sum41);  \
126    vstore4(sum41, col_lcl, (__local float*) &smem[1][2]);
127#endif
128
129#define noconvert
130
131__kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
132                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
133{
134    const int x = get_global_id(0)*kercn;
135    const int y = 2*get_global_id(1);
136
137    __local FT smem[2][LOCAL_SIZE + 4];
138    __global uchar * dstData = dst + dst_offset;
139    __global const uchar * srcData = src + src_offset;
140
141    FT sum0, sum1, temp;
142    FT co1 = 0.375f;
143    FT co2 = 0.25f;
144    FT co3 = 0.0625f;
145
146    const int src_y = 2*y;
147    int col;
148
149    if (src_y >= 2 && src_y < src_rows - 4)
150    {
151#undef EXTRAPOLATE_
152#define EXTRAPOLATE_(val, maxVal)   val
153#if kercn == 1
154        col = EXTRAPOLATE(x, src_cols);
155        LOAD_LOCAL(col, 2 + get_local_id(0))
156#else
157        if (x < src_cols-4)
158        {
159            float4 sum40, sum41, temp4;
160            LOAD_LOCAL4(x, get_local_id(0))
161        }
162        else
163        {
164            for (int i=0; i<4; i++)
165            {
166                col = EXTRAPOLATE(x+i, src_cols);
167                LOAD_LOCAL(col, 2 + 4 * get_local_id(0) + i)
168            }
169        }
170#endif
171        if (get_local_id(0) < 2)
172        {
173            col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
174            LOAD_LOCAL(col, get_local_id(0))
175        }
176        else if (get_local_id(0) < 4)
177        {
178            col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
179            LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))
180        }
181    }
182    else // need extrapolate y
183    {
184#undef EXTRAPOLATE_
185#define EXTRAPOLATE_(val, maxVal)   EXTRAPOLATE(val, maxVal)
186#if kercn == 1
187        col = EXTRAPOLATE(x, src_cols);
188        LOAD_LOCAL(col, 2 + get_local_id(0))
189#else
190        if (x < src_cols-4)
191        {
192            float4 sum40, sum41, temp4;
193            LOAD_LOCAL4(x, get_local_id(0))
194        }
195        else
196        {
197            for (int i=0; i<4; i++)
198            {
199                col = EXTRAPOLATE(x+i, src_cols);
200                LOAD_LOCAL(col, 2 + 4*get_local_id(0) + i)
201            }
202        }
203#endif
204        if (get_local_id(0) < 2)
205        {
206            col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
207            LOAD_LOCAL(col, get_local_id(0))
208        }
209        else if (get_local_id(0) < 4)
210        {
211            col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
212            LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))
213        }
214    }
215
216    barrier(CLK_LOCAL_MEM_FENCE);
217
218#if kercn == 1
219    if (get_local_id(0) < LOCAL_SIZE / 2)
220    {
221        const int tid2 = get_local_id(0) * 2;
222
223        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
224
225        if (dst_x < dst_cols)
226        {
227            for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
228            {
229#if cn == 1
230#if fdepth <= 5
231                FT sum = dot(vload4(0, (__local float*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (float4)(co3, co2, co1, co2));
232#else
233                FT sum = dot(vload4(0, (__local double*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (double4)(co3, co2, co1, co2));
234#endif
235#else
236                FT sum = co3 * smem[yin - y][2 + tid2 - 2];
237                sum = MAD(co2, smem[yin - y][2 + tid2 - 1], sum);
238                sum = MAD(co1, smem[yin - y][2 + tid2    ], sum);
239                sum = MAD(co2, smem[yin - y][2 + tid2 + 1], sum);
240#endif
241                sum = MAD(co3, smem[yin - y][2 + tid2 + 2], sum);
242                storepix(convertToT(sum), dstData + yin * dst_step + dst_x * PIXSIZE);
243            }
244        }
245    }
246#else
247    int tid4 = get_local_id(0) * 4;
248    int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2;
249    if (dst_x < dst_cols - 1)
250    {
251        for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
252        {
253
254            FT sum =  co3* smem[yin - y][2 + tid4 + 2];
255            sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);
256            sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
257            sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);
258            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
259            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
260
261            dst_x ++;
262            sum =     co3* smem[yin - y][2 + tid4 + 4];
263            sum = MAD(co3, smem[yin - y][2 + tid4    ], sum);
264            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
265            sum = MAD(co1, smem[yin - y][2 + tid4 + 2], sum);
266            sum = MAD(co2, smem[yin - y][2 + tid4 + 3], sum);
267            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
268            dst_x --;
269        }
270
271    }
272    else if (dst_x < dst_cols)
273    {
274        for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
275        {
276            FT sum =  co3* smem[yin - y][2 + tid4 + 2];
277            sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);
278            sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
279            sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);
280            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
281
282            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
283        }
284    }
285#endif
286
287}
288