1/*M/////////////////////////////////////////////////////////////////////////////////////// 2// 3// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4// 5// By downloading, copying, installing or using the software you agree to this license. 6// If you do not agree to this license, do not download, install, 7// copy or use the software. 8// 9// 10// License Agreement 11// For Open Source Computer Vision Library 12// 13// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. 14// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. 15// Third party copyrights are property of their respective owners. 16// 17// @Authors 18// Zhang Ying, zhangying913@gmail.com 19// 20// Redistribution and use in source and binary forms, with or without modification, 21// are permitted provided that the following conditions are met: 22// 23// * Redistribution's of source code must retain the above copyright notice, 24// this list of conditions and the following disclaimer. 25// 26// * Redistribution's in binary form must reproduce the above copyright notice, 27// this list of conditions and the following disclaimer in the documentation 28// and/or other materials provided with the distribution. 29// 30// * The name of the copyright holders may not be used to endorse or promote products 31// derived from this software without specific prior written permission. 32// 33// This software is provided by the copyright holders and contributors as is and 34// any express or implied warranties, including, but not limited to, the implied 35// warranties of merchantability and fitness for a particular purpose are disclaimed. 
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif

// Fixed-point layout for the inverse affine mapping:
//   INTER_BITS            sub-pixel precision bits (INTER_TAB_SIZE = 32 fractional offsets)
//   AB_BITS               fractional bits carried by the M*x / M*y products
//   INTER_REMAP_COEF_BITS precision of the integer interpolation weights (depth <= 4 path)
#define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS)
// NOTE(review): parenthesized — the previous expansion `1.f/INTER_TAB_SIZE` silently
// mis-associates if the macro ever appears to the right of `/` or `*`.
#define INTER_SCALE (1.f/INTER_TAB_SIZE)
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
#define INTER_REMAP_COEF_BITS 15
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
// Round-to-nearest term applied before dropping the (AB_BITS - INTER_BITS) extra bits.
#define ROUND_DELTA (1 << (AB_BITS - INTER_BITS - 1))

#define noconvert

// Border-value (scalar_) type defaults to the pixel type unless the host overrides it.
#ifndef ST
#define ST T
#endif

// 3-channel pixels are not naturally aligned, so cn == 3 must go through
// vload3/vstore3 on the element type T1 instead of a plain T load/store.
#if cn != 3
#define loadpix(addr) *(__global const T*)(addr)
#define storepix(val, addr) *(__global T*)(addr) = val
#define scalar scalar_
#define pixsize (int)sizeof(T)
#else
#define loadpix(addr) vload3(0, (__global const T1*)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))
#ifdef INTER_NEAREST
#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
#else
#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z)
#endif
#define pixsize ((int)sizeof(T1)*3)
#endif

#ifdef INTER_NEAREST

// Nearest-neighbour warpAffine.
// Each work-item handles one destination column (dx) and rowsPerWI consecutive
// destination rows, maps each (dx, dy) through the 2x3 inverse matrix M in
// AB_BITS fixed point, and copies the nearest source pixel — or the border
// scalar when the mapped coordinate falls outside the source image.
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                         __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                         __constant CT * M, ST scalar_)
{
    int dx = get_global_id(0);
    int dy0 = get_global_id(1) * rowsPerWI;

    if (dx < dst_cols)
    {
        // Round-to-nearest when truncating the AB_BITS fractional bits below.
        int round_delta = (AB_SCALE >> 1);

        // Column-dependent part of the fixed-point source coordinates; the
        // row-dependent part is added per iteration inside the loop.
        int X0_ = rint(M[0] * dx * AB_SCALE);
        int Y0_ = rint(M[3] * dx * AB_SCALE);
        int dst_index = mad24(dy0, dst_step, mad24(dx, pixsize, dst_offset));

        for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy, dst_index += dst_step)
        {
            int X0 = X0_ + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + round_delta;
            int Y0 = Y0_ + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + round_delta;

            short sx = convert_short_sat(X0 >> AB_BITS);
            short sy = convert_short_sat(Y0 >> AB_BITS);

            if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)
            {
                int src_index = mad24(sy, src_step, mad24(sx, pixsize, src_offset));
                storepix(loadpix(srcptr + src_index), dstptr + dst_index);
            }
            else
                storepix(scalar, dstptr + dst_index);
        }
    }
}

#elif defined INTER_LINEAR

// Precomputed bilinear tap pairs: coeffs[2*i] = 1 - i/32, coeffs[2*i+1] = i/32
// for each of the INTER_TAB_SIZE sub-pixel offsets (used on the non-AMD path).
__constant float coeffs[64] =
{ 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,
  0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,
  0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,
  0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,
  0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,
  0.062500f, 0.937500f, 0.031250f, 0.968750f };

// Bilinear warpAffine.
// Same work partitioning as the nearest kernel (one column, rowsPerWI rows per
// work-item). The mapped coordinate is split into an integer pixel (sx, sy) and
// a 5-bit sub-pixel offset (ax, ay); the 2x2 neighbourhood is blended either
// with integer weights (depth <= 4) or float weights, depending on device/depth.
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                         __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                         __constant CT * M, ST scalar_)
{
    int dx = get_global_id(0);
    int dy0 = get_global_id(1) * rowsPerWI;

    if (dx < dst_cols)
    {
        int tmp = dx << AB_BITS;
        int X0_ = rint(M[0] * tmp);
        int Y0_ = rint(M[3] * tmp);

        for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy)
        {
            int X0 = X0_ + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + ROUND_DELTA;
            int Y0 = Y0_ + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + ROUND_DELTA;
            X0 = X0 >> (AB_BITS - INTER_BITS);
            Y0 = Y0 >> (AB_BITS - INTER_BITS);

            // Integer pixel coordinate and 0..31 sub-pixel fraction.
            short sx = convert_short_sat(X0 >> INTER_BITS), sy = convert_short_sat(Y0 >> INTER_BITS);
            short ax = convert_short(X0 & (INTER_TAB_SIZE-1)), ay = convert_short(Y0 & (INTER_TAB_SIZE-1));

#if defined AMD_DEVICE || depth > 4
            // Gather the 2x2 neighbourhood; taps outside the source use the border scalar.
            WT v0 = scalar, v1 = scalar, v2 = scalar, v3 = scalar;
            if (sx >= 0 && sx < src_cols)
            {
                if (sy >= 0 && sy < src_rows)
                    v0 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset))));
                if (sy+1 >= 0 && sy+1 < src_rows)
                    v2 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx, pixsize, src_offset))));
            }
            if (sx+1 >= 0 && sx+1 < src_cols)
            {
                if (sy >= 0 && sy < src_rows)
                    v1 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx+1, pixsize, src_offset))));
                if (sy+1 >= 0 && sy+1 < src_rows)
                    v3 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx+1, pixsize, src_offset))));
            }

            float taby = 1.f/INTER_TAB_SIZE*ay;
            float tabx = 1.f/INTER_TAB_SIZE*ax;

            int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));

#if depth <= 4
            // Integer blend: 15-bit fixed-point weights, rounded back down after mad24.
            int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
            int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );
            int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
            int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );

            WT val = mad24(v0, itab0, mad24(v1, itab1, mad24(v2, itab2, v3 * itab3)));
            storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
#else
            // Float blend for wide depths.
            float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
            WT val = fma(tabx2, fma(v0, taby2, v2 * taby), tabx * fma(v1, taby2, v3 * taby));
            storepix(convertToT(val), dstptr + dst_index);
#endif
#else // INTEL_DEVICE
            // Table-driven separable blend: horizontal pass per row, then vertical.
            __constant float * coeffs_y = coeffs + (ay << 1), * coeffs_x = coeffs + (ax << 1);

            int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;
            int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));

            WT sum = (WT)(0), xsum;
            #pragma unroll
            for (int y = 0; y < 2; y++)
            {
                src_index = mad24(y, src_step, src_index0);
                if (sy + y >= 0 && sy + y < src_rows)
                {
                    xsum = (WT)(0);
                    // Fast path: both horizontal taps are strictly inside the row.
                    if (sx >= 0 && sx + 2 < src_cols)
                    {
#if depth == 0 && cn == 1
                        uchar2 value = vload2(0, srcptr + src_index);
                        xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
#else
                        #pragma unroll
                        for (int x = 0; x < 2; x++)
                            xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
#endif
                    }
                    else
                    {
                        // Slow path: substitute the border scalar for out-of-range taps.
                        #pragma unroll
                        for (int x = 0; x < 2; x++)
                            xsum = fma(sx + x >= 0 && sx + x < src_cols ?
                                       convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
                    }
                    sum = fma(xsum, coeffs_y[y], sum);
                }
                else
                    sum = fma(scalar, coeffs_y[y], sum);
            }

            storepix(convertToT(sum), dstptr + dst_index);
#endif
        }
    }
}

#elif defined INTER_CUBIC

#ifdef AMD_DEVICE

// Bicubic kernel weights (Catmull-Rom family with A = -0.75), evaluated on the
// fly for a sub-pixel offset x in [0, 1); coeffs[0..3] are the four tap weights
// and sum to 1 by construction of coeffs[3].
inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;

    coeffs[0] = fma(fma(fma(A, (x + 1.f), - 5.0f*A), (x + 1.f), 8.0f*A), x + 1.f, - 4.0f*A);
    coeffs[1] = fma(fma(A + 2.f, x, - (A + 3.f)), x*x, 1.f);
    coeffs[2] = fma(fma(A + 2.f, 1.f - x, - (A + 3.f)), (1.f - x)*(1.f - x), 1.f);
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}

#else

// Precomputed bicubic weights: four taps for each of the 32 sub-pixel offsets
// (same A = -0.75 kernel as interpolateCubic above, tabulated).
__constant float coeffs[128] =
    { 0.000000f, 1.000000f, 0.000000f, 0.000000f, -0.021996f, 0.997841f, 0.024864f, -0.000710f, -0.041199f, 0.991516f, 0.052429f, -0.002747f,
      -0.057747f, 0.981255f, 0.082466f, -0.005974f, -0.071777f, 0.967285f, 0.114746f, -0.010254f, -0.083427f, 0.949837f, 0.149040f, -0.015450f,
      -0.092834f, 0.929138f, 0.185120f, -0.021423f, -0.100136f, 0.905418f, 0.222755f, -0.028038f, -0.105469f, 0.878906f, 0.261719f, -0.035156f,
      -0.108971f, 0.849831f, 0.301781f, -0.042641f, -0.110779f, 0.818420f, 0.342712f, -0.050354f, -0.111031f, 0.784904f, 0.384285f, -0.058159f,
      -0.109863f, 0.749512f, 0.426270f, -0.065918f, -0.107414f, 0.712471f, 0.468437f, -0.073494f, -0.103821f, 0.674011f, 0.510559f, -0.080750f,
      -0.099220f, 0.634361f, 0.552406f, -0.087547f, -0.093750f, 0.593750f, 0.593750f, -0.093750f, -0.087547f, 0.552406f, 0.634361f, -0.099220f,
      -0.080750f, 0.510559f, 0.674011f, -0.103821f, -0.073494f, 0.468437f, 0.712471f, -0.107414f, -0.065918f, 0.426270f, 0.749512f, -0.109863f,
      -0.058159f, 0.384285f, 0.784904f, -0.111031f, -0.050354f, 0.342712f, 0.818420f, -0.110779f, -0.042641f, 0.301781f, 0.849831f, -0.108971f,
      -0.035156f, 0.261719f, 0.878906f, -0.105469f, -0.028038f, 0.222755f, 0.905418f, -0.100136f, -0.021423f, 0.185120f, 0.929138f, -0.092834f,
      -0.015450f, 0.149040f, 0.949837f, -0.083427f, -0.010254f, 0.114746f, 0.967285f, -0.071777f, -0.005974f, 0.082466f, 0.981255f, -0.057747f,
      -0.002747f, 0.052429f, 0.991516f, -0.041199f, -0.000710f, 0.024864f, 0.997841f, -0.021996f };

#endif

// Bicubic warpAffine: one work-item per destination pixel. (sx, sy) addresses
// the top-left of the 4x4 neighbourhood (hence the "- 1"); taps outside the
// source are replaced by the border scalar.
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                         __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                         __constant CT * M, ST scalar_)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if (dx < dst_cols && dy < dst_rows)
    {
        int tmp = (dx << AB_BITS);
        int X0 = rint(M[0] * tmp) + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + ROUND_DELTA;
        int Y0 = rint(M[3] * tmp) + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + ROUND_DELTA;

        X0 = X0 >> (AB_BITS - INTER_BITS);
        Y0 = Y0 >> (AB_BITS - INTER_BITS);

        int sx = (short)(X0 >> INTER_BITS) - 1, sy = (short)(Y0 >> INTER_BITS) - 1;
        int ay = (short)(Y0 & (INTER_TAB_SIZE - 1)), ax = (short)(X0 & (INTER_TAB_SIZE - 1));

#ifdef AMD_DEVICE
        // Gather the full 4x4 neighbourhood into v[], border scalar outside.
        WT v[16];
        #pragma unroll
        for (int y = 0; y < 4; y++)
        {
            if (sy+y >= 0 && sy+y < src_rows)
            {
                #pragma unroll
                for (int x = 0; x < 4; x++)
                    v[mad24(y, 4, x)] = sx+x >= 0 && sx+x < src_cols ?
                        convertToWT(loadpix(srcptr + mad24(sy+y, src_step, mad24(sx+x, pixsize, src_offset)))) : scalar;
            }
            else
            {
                #pragma unroll
                for (int x = 0; x < 4; x++)
                    v[mad24(y, 4, x)] = scalar;
            }
        }

        float tab1y[4], tab1x[4];

        float ayy = INTER_SCALE * ay;
        float axx = INTER_SCALE * ax;
        interpolateCubic(ayy, tab1y);
        interpolateCubic(axx, tab1x);

        int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));

        WT sum = (WT)(0);
#if depth <= 4
        // Integer accumulation with 15-bit fixed-point weights.
        int itab[16];

        #pragma unroll
        for (int i = 0; i < 16; i++)
            itab[i] = rint(tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE);

        #pragma unroll
        for (int i = 0; i < 16; i++)
            sum = mad24(v[i], itab[i], sum);
        storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
#else
        // Float accumulation for wide depths.
        #pragma unroll
        for (int i = 0; i < 16; i++)
            sum = fma(v[i], tab1y[(i>>2)] * tab1x[(i&3)], sum);
        storepix(convertToT( sum ), dstptr + dst_index);
#endif
#else // INTEL_DEVICE
        // Table-driven separable blend: horizontal 4-tap pass per row, then vertical.
        __constant float * coeffs_y = coeffs + (ay << 2), * coeffs_x = coeffs + (ax << 2);

        int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;
        int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));

        WT sum = (WT)(0), xsum;
        #pragma unroll
        for (int y = 0; y < 4; y++)
        {
            src_index = mad24(y, src_step, src_index0);
            if (sy + y >= 0 && sy + y < src_rows)
            {
                xsum = (WT)(0);
                // Fast path: all four horizontal taps inside the row.
                if (sx >= 0 && sx + 4 < src_cols)
                {
#if depth == 0 && cn == 1
                    uchar4 value = vload4(0, srcptr + src_index);
                    xsum = dot(convert_float4(value), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
#else
                    #pragma unroll
                    for (int x = 0; x < 4; x++)
                        xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
#endif
                }
                else
                {
                    // Slow path: substitute the border scalar for out-of-range taps.
                    #pragma unroll
                    for (int x = 0; x < 4; x++)
                        xsum = fma(sx + x >= 0 && sx + x < src_cols ?
                                   convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
                }
                sum = fma(xsum, coeffs_y[y], sum);
            }
            else
                sum = fma(scalar, coeffs_y[y], sum);
        }

        storepix(convertToT(sum), dstptr + dst_index);
#endif
    }
}

#endif