1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5//  By downloading, copying, installing or using the software you agree to this license.
6//  If you do not agree to this license, do not download, install,
7//  copy or use the software.
8//
9//
10//                           License Agreement
11//                For Open Source Computer Vision Library
12//
13// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15// Third party copyrights are property of their respective owners.
16//
17// @Authors
18//    Zhang Ying, zhangying913@gmail.com
19//
20// Redistribution and use in source and binary forms, with or without modification,
21// are permitted provided that the following conditions are met:
22//
23//   * Redistribution's of source code must retain the above copyright notice,
24//     this list of conditions and the following disclaimer.
25//
26//   * Redistribution's in binary form must reproduce the above copyright notice,
27//     this list of conditions and the following disclaimer in the documentation
28//     and/or other materials provided with the distribution.
29//
30//   * The name of the copyright holders may not be used to endorse or promote products
31//     derived from this software without specific prior written permission.
32//
33// This software is provided by the copyright holders and contributors as is and
34// any express or implied warranties, including, but not limited to, the implied
35// warranties of merchantability and fitness for a particular purpose are disclaimed.
36// In no event shall the Intel Corporation or contributors be liable for any direct,
37// indirect, incidental, special, exemplary, or consequential damages
38// (including, but not limited to, procurement of substitute goods or services;
39// loss of use, data, or profits; or business interruption) however caused
40// and on any theory of liability, whether in contract, strict liability,
41// or tort (including negligence or otherwise) arising in any way out of
42// the use of this software, even if advised of the possibility of such damage.
43//
44//M*/
45
46#ifdef DOUBLE_SUPPORT
47#ifdef cl_amd_fp64
48#pragma OPENCL EXTENSION cl_amd_fp64:enable
49#elif defined (cl_khr_fp64)
50#pragma OPENCL EXTENSION cl_khr_fp64:enable
51#endif
52#endif
53
54#define INTER_BITS 5
55#define INTER_TAB_SIZE (1 << INTER_BITS)
56#define INTER_SCALE 1.f/INTER_TAB_SIZE
57#define AB_BITS max(10, (int)INTER_BITS)
58#define AB_SCALE (1 << AB_BITS)
59#define INTER_REMAP_COEF_BITS 15
60#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
61#define ROUND_DELTA (1 << (AB_BITS - INTER_BITS - 1))
62
63#define noconvert
64
65#ifndef ST
66#define ST T
67#endif
68
69#if cn != 3
70#define loadpix(addr)  *(__global const T*)(addr)
71#define storepix(val, addr)  *(__global T*)(addr) = val
72#define scalar scalar_
73#define pixsize (int)sizeof(T)
74#else
75#define loadpix(addr)  vload3(0, (__global const T1*)(addr))
76#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))
77#ifdef INTER_NEAREST
78#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
79#else
80#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z)
81#endif
82#define pixsize ((int)sizeof(T1)*3)
83#endif
84
85#ifdef INTER_NEAREST
86
87__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
88                         __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
89                         __constant CT * M, ST scalar_)
90{
91    int dx = get_global_id(0);
92    int dy0 = get_global_id(1) * rowsPerWI;
93
94    if (dx < dst_cols)
95    {
96        int round_delta = (AB_SCALE >> 1);
97
98        int X0_ = rint(M[0] * dx * AB_SCALE);
99        int Y0_ = rint(M[3] * dx * AB_SCALE);
100        int dst_index = mad24(dy0, dst_step, mad24(dx, pixsize, dst_offset));
101
102        for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy, dst_index += dst_step)
103        {
104            int X0 = X0_ + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + round_delta;
105            int Y0 = Y0_ + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + round_delta;
106
107            short sx = convert_short_sat(X0 >> AB_BITS);
108            short sy = convert_short_sat(Y0 >> AB_BITS);
109
110            if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)
111            {
112                int src_index = mad24(sy, src_step, mad24(sx, pixsize, src_offset));
113                storepix(loadpix(srcptr + src_index), dstptr + dst_index);
114            }
115            else
116                storepix(scalar, dstptr + dst_index);
117        }
118    }
119}
120
121#elif defined INTER_LINEAR
122
123__constant float coeffs[64] =
124{ 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,
125    0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,
126    0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,
127    0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,
128    0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,
129    0.062500f, 0.937500f, 0.031250f, 0.968750f };
130
131__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
132                         __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
133                         __constant CT * M, ST scalar_)
134{
135    int dx = get_global_id(0);
136    int dy0 = get_global_id(1) * rowsPerWI;
137
138    if (dx < dst_cols)
139    {
140        int tmp = dx << AB_BITS;
141        int X0_ = rint(M[0] * tmp);
142        int Y0_ = rint(M[3] * tmp);
143
144        for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy)
145        {
146            int X0 = X0_ + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + ROUND_DELTA;
147            int Y0 = Y0_ + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + ROUND_DELTA;
148            X0 = X0 >> (AB_BITS - INTER_BITS);
149            Y0 = Y0 >> (AB_BITS - INTER_BITS);
150
151            short sx = convert_short_sat(X0 >> INTER_BITS), sy = convert_short_sat(Y0 >> INTER_BITS);
152            short ax = convert_short(X0 & (INTER_TAB_SIZE-1)), ay = convert_short(Y0 & (INTER_TAB_SIZE-1));
153
154#if defined AMD_DEVICE || depth > 4
155            WT v0 = scalar, v1 = scalar, v2 = scalar, v3 = scalar;
156            if (sx >= 0 && sx < src_cols)
157            {
158                if (sy >= 0 && sy < src_rows)
159                    v0 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset))));
160                if (sy+1 >= 0 && sy+1 < src_rows)
161                    v2 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx, pixsize, src_offset))));
162            }
163            if (sx+1 >= 0 && sx+1 < src_cols)
164            {
165                if (sy >= 0 && sy < src_rows)
166                    v1 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx+1, pixsize, src_offset))));
167                if (sy+1 >= 0 && sy+1 < src_rows)
168                    v3 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx+1, pixsize, src_offset))));
169            }
170
171            float taby = 1.f/INTER_TAB_SIZE*ay;
172            float tabx = 1.f/INTER_TAB_SIZE*ax;
173
174            int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
175
176#if depth <= 4
177            int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
178            int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );
179            int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
180            int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
181
182            WT val = mad24(v0, itab0, mad24(v1, itab1, mad24(v2, itab2, v3 * itab3)));
183            storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
184#else
185            float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
186            WT val = fma(tabx2, fma(v0, taby2, v2 * taby), tabx * fma(v1, taby2, v3 * taby));
187            storepix(convertToT(val), dstptr + dst_index);
188#endif
189#else // INTEL_DEVICE
190            __constant float * coeffs_y = coeffs + (ay << 1), * coeffs_x = coeffs + (ax << 1);
191
192            int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;
193            int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
194
195            WT sum = (WT)(0), xsum;
196            #pragma unroll
197            for (int y = 0; y < 2; y++)
198            {
199                src_index = mad24(y, src_step, src_index0);
200                if (sy + y >= 0 && sy + y < src_rows)
201                {
202                    xsum = (WT)(0);
203                    if (sx >= 0 && sx + 2 < src_cols)
204                    {
205#if depth == 0 && cn == 1
206                        uchar2 value = vload2(0, srcptr + src_index);
207                        xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
208#else
209                        #pragma unroll
210                        for (int x = 0; x < 2; x++)
211                            xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
212#endif
213                    }
214                    else
215                    {
216                        #pragma unroll
217                        for (int x = 0; x < 2; x++)
218                            xsum = fma(sx + x >= 0 && sx + x < src_cols ?
219                                       convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
220                    }
221                    sum = fma(xsum, coeffs_y[y], sum);
222                }
223                else
224                    sum = fma(scalar, coeffs_y[y], sum);
225            }
226
227            storepix(convertToT(sum), dstptr + dst_index);
228#endif
229        }
230    }
231}
232
233#elif defined INTER_CUBIC
234
235#ifdef AMD_DEVICE
236
237inline void interpolateCubic( float x, float* coeffs )
238{
239    const float A = -0.75f;
240
241    coeffs[0] = fma(fma(fma(A, (x + 1.f), - 5.0f*A), (x + 1.f), 8.0f*A), x + 1.f, - 4.0f*A);
242    coeffs[1] = fma(fma(A + 2.f, x, - (A + 3.f)), x*x, 1.f);
243    coeffs[2] = fma(fma(A + 2.f, 1.f - x, - (A + 3.f)), (1.f - x)*(1.f - x), 1.f);
244    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
245}
246
247#else
248
249__constant float coeffs[128] =
250    { 0.000000f, 1.000000f, 0.000000f, 0.000000f, -0.021996f, 0.997841f, 0.024864f, -0.000710f, -0.041199f, 0.991516f, 0.052429f, -0.002747f,
251    -0.057747f, 0.981255f, 0.082466f, -0.005974f, -0.071777f, 0.967285f, 0.114746f, -0.010254f, -0.083427f, 0.949837f, 0.149040f, -0.015450f,
252    -0.092834f, 0.929138f, 0.185120f, -0.021423f, -0.100136f, 0.905418f, 0.222755f, -0.028038f, -0.105469f, 0.878906f, 0.261719f, -0.035156f,
253    -0.108971f, 0.849831f, 0.301781f, -0.042641f, -0.110779f, 0.818420f, 0.342712f, -0.050354f, -0.111031f, 0.784904f, 0.384285f, -0.058159f,
254    -0.109863f, 0.749512f, 0.426270f, -0.065918f, -0.107414f, 0.712471f, 0.468437f, -0.073494f, -0.103821f, 0.674011f, 0.510559f, -0.080750f,
255    -0.099220f, 0.634361f, 0.552406f, -0.087547f, -0.093750f, 0.593750f, 0.593750f, -0.093750f, -0.087547f, 0.552406f, 0.634361f, -0.099220f,
256    -0.080750f, 0.510559f, 0.674011f, -0.103821f, -0.073494f, 0.468437f, 0.712471f, -0.107414f, -0.065918f, 0.426270f, 0.749512f, -0.109863f,
257    -0.058159f, 0.384285f, 0.784904f, -0.111031f, -0.050354f, 0.342712f, 0.818420f, -0.110779f, -0.042641f, 0.301781f, 0.849831f, -0.108971f,
258    -0.035156f, 0.261719f, 0.878906f, -0.105469f, -0.028038f, 0.222755f, 0.905418f, -0.100136f, -0.021423f, 0.185120f, 0.929138f, -0.092834f,
259    -0.015450f, 0.149040f, 0.949837f, -0.083427f, -0.010254f, 0.114746f, 0.967285f, -0.071777f, -0.005974f, 0.082466f, 0.981255f, -0.057747f,
260    -0.002747f, 0.052429f, 0.991516f, -0.041199f, -0.000710f, 0.024864f, 0.997841f, -0.021996f };
261
262#endif
263
264__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
265                         __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
266                         __constant CT * M, ST scalar_)
267{
268    int dx = get_global_id(0);
269    int dy = get_global_id(1);
270
271    if (dx < dst_cols && dy < dst_rows)
272    {
273        int tmp = (dx << AB_BITS);
274        int X0 = rint(M[0] * tmp) + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + ROUND_DELTA;
275        int Y0 = rint(M[3] * tmp) + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + ROUND_DELTA;
276
277        X0 = X0 >> (AB_BITS - INTER_BITS);
278        Y0 = Y0 >> (AB_BITS - INTER_BITS);
279
280        int sx = (short)(X0 >> INTER_BITS) - 1, sy = (short)(Y0 >> INTER_BITS) - 1;
281        int ay = (short)(Y0 & (INTER_TAB_SIZE - 1)), ax = (short)(X0 & (INTER_TAB_SIZE - 1));
282
283#ifdef AMD_DEVICE
284        WT v[16];
285        #pragma unroll
286        for (int y = 0; y < 4; y++)
287        {
288            if (sy+y >= 0 && sy+y < src_rows)
289            {
290                #pragma unroll
291                for (int x = 0; x < 4; x++)
292                    v[mad24(y, 4, x)] = sx+x >= 0 && sx+x < src_cols ?
293                        convertToWT(loadpix(srcptr + mad24(sy+y, src_step, mad24(sx+x, pixsize, src_offset)))) : scalar;
294            }
295            else
296            {
297                #pragma unroll
298                for (int x = 0; x < 4; x++)
299                    v[mad24(y, 4, x)] = scalar;
300            }
301        }
302
303        float tab1y[4], tab1x[4];
304
305        float ayy = INTER_SCALE * ay;
306        float axx = INTER_SCALE * ax;
307        interpolateCubic(ayy, tab1y);
308        interpolateCubic(axx, tab1x);
309
310        int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
311
312        WT sum = (WT)(0);
313#if depth <= 4
314        int itab[16];
315
316        #pragma unroll
317        for (int i = 0; i < 16; i++)
318            itab[i] = rint(tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE);
319
320        #pragma unroll
321        for (int i = 0; i < 16; i++)
322            sum = mad24(v[i], itab[i], sum);
323        storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
324#else
325        #pragma unroll
326        for (int i = 0; i < 16; i++)
327            sum = fma(v[i], tab1y[(i>>2)] * tab1x[(i&3)], sum);
328        storepix(convertToT( sum ), dstptr + dst_index);
329#endif
330#else // INTEL_DEVICE
331        __constant float * coeffs_y = coeffs + (ay << 2), * coeffs_x = coeffs + (ax << 2);
332
333        int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;
334        int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
335
336        WT sum = (WT)(0), xsum;
337        #pragma unroll
338        for (int y = 0; y < 4; y++)
339        {
340            src_index = mad24(y, src_step, src_index0);
341            if (sy + y >= 0 && sy + y < src_rows)
342            {
343                xsum = (WT)(0);
344                if (sx >= 0 && sx + 4 < src_cols)
345                {
346#if depth == 0 && cn == 1
347                    uchar4 value = vload4(0, srcptr + src_index);
348                    xsum = dot(convert_float4(value), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
349#else
350                    #pragma unroll
351                    for (int x = 0; x < 4; x++)
352                        xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
353#endif
354                }
355                else
356                {
357                    #pragma unroll
358                    for (int x = 0; x < 4; x++)
359                        xsum = fma(sx + x >= 0 && sx + x < src_cols ?
360                                   convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
361                }
362                sum = fma(xsum, coeffs_y[y], sum);
363            }
364            else
365                sum = fma(scalar, coeffs_y[y], sum);
366        }
367
368        storepix(convertToT(sum), dstptr + dst_index);
369#endif
370    }
371}
372
373#endif
374