1// This file is part of OpenCV project. 2// It is subject to the license terms in the LICENSE file found in the top-level directory 3// of this distribution and at http://opencv.org/license.html. 4 5__constant short4 vec_offset = (short4)(0, 1, 2, 3); 6 7#define GET_VAL(x, y) ((x) < 0 || (x) >= src_cols || (y) < 0 || (y) >= src_rows) ? scalar : src[src_offset + y * src_step + x] 8 9__kernel void warpAffine_nearest_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, 10 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 11 __constant float * M, ST scalar_) 12{ 13 int x = get_global_id(0) * 4; 14 int y = get_global_id(1); 15 uchar scalar = convert_uchar_sat_rte(scalar_); 16 17 if (x >= dst_cols || y >= dst_rows) return; 18 19 /* { M0, M1, M2 } 20 * { M3, M4, M5 } 21 */ 22 23 short4 new_x, new_y; 24 new_x = convert_short4_sat_rte(M[0] * convert_float4(vec_offset + (short4)(x)) + 25 M[1] * convert_float4((short4)y) + M[2]); 26 27 new_y = convert_short4_sat_rte(M[3] * convert_float4(vec_offset + (short4)(x)) + 28 M[4] * convert_float4((short4)y) + M[5]); 29 30 uchar4 pix = (uchar4)scalar; 31 32 pix.s0 = GET_VAL(new_x.s0, new_y.s0); 33 pix.s1 = GET_VAL(new_x.s1, new_y.s1); 34 pix.s2 = GET_VAL(new_x.s2, new_y.s2); 35 pix.s3 = GET_VAL(new_x.s3, new_y.s3); 36 37 int dst_index = x + y * dst_step + dst_offset; 38 39 vstore4(pix, 0, dst + dst_index); 40} 41 42uchar4 read_pixels(__global const uchar * src, short tx, short ty, 43 int src_offset, int src_step, int src_cols, int 44 src_rows, uchar scalar) 45{ 46 uchar2 pt, pb; 47 short bx, by; 48 49 bx = tx + 1; 50 by = ty + 1; 51 52 if (tx >= 0 && (tx + 1) < src_cols && ty >= 0 && ty < src_rows) 53 { 54 pt = vload2(0, src + src_offset + ty * src_step + tx); 55 } 56 else 57 { 58 pt.s0 = GET_VAL(tx, ty); 59 pt.s1 = GET_VAL(bx, ty); 60 } 61 62 if (tx >= 0 && (tx + 1) < src_cols && by >= 0 && by < src_rows) 63 { 64 pb = vload2(0, src + src_offset + by * src_step + tx); 65 } 66 else 67 { 68 pb.s0 = GET_VAL(tx, by); 69 pb.s1 = GET_VAL(bx, by); 70 } 71 72 return (uchar4)(pt, pb); 73} 74 75__kernel void warpAffine_linear_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, 76 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 77 __constant float * M, ST scalar_) 78{ 79 int x = get_global_id(0) * 4; 80 int y = get_global_id(1); 81 uchar scalar = convert_uchar_sat_rte(scalar_); 82 83 if (x >= dst_cols || y >= dst_rows) return; 84 85 /* { M0, M1, M2 } 86 * { M3, M4, M5 } 87 */ 88 89 float4 nx, ny; 90 nx = M[0] * convert_float4((vec_offset + (short4)x)) + M[1] * convert_float4((short4)y) + M[2]; 91 ny = M[3] * convert_float4((vec_offset + (short4)x)) + M[4] * convert_float4((short4)y) + M[5]; 92 93 float4 s, t; 94 s = round((nx - floor(nx)) * 32.0f) / 32.0f; 95 t = round((ny - floor(ny)) * 32.0f) / 32.0f; 96 97 short4 tx, ty; 98 tx = convert_short4_sat_rtn(nx); 99 ty = convert_short4_sat_rtn(ny); 100 101 uchar4 pix[4]; 102 pix[0] = read_pixels(src, tx.s0, ty.s0, src_offset, src_step, src_cols, src_rows, scalar); 103 pix[1] = read_pixels(src, tx.s1, ty.s1, src_offset, src_step, src_cols, src_rows, scalar); 104 pix[2] = read_pixels(src, tx.s2, ty.s2, src_offset, src_step, src_cols, src_rows, scalar); 105 pix[3] = read_pixels(src, tx.s3, ty.s3, src_offset, src_step, src_cols, src_rows, scalar); 106 107 float4 tl, tr, bl, br; 108 tl = convert_float4((uchar4)(pix[0].s0, pix[1].s0, pix[2].s0, pix[3].s0)); 109 tr = convert_float4((uchar4)(pix[0].s1, pix[1].s1, pix[2].s1, pix[3].s1)); 110 bl = convert_float4((uchar4)(pix[0].s2, pix[1].s2, pix[2].s2, pix[3].s2)); 111 br = convert_float4((uchar4)(pix[0].s3, pix[1].s3, pix[2].s3, pix[3].s3)); 112 113 float4 pixel; 114 pixel = tl * (1 - s) * (1 - t) + tr * s * (1 - t) + bl * (1 - s) * t + br * s * t; 115 116 int dst_index = x + y * dst_step + dst_offset; 117 vstore4(convert_uchar4_sat_rte(pixel), 0, dst + dst_index); 118} 119 120__constant float coeffs[128] = 121 { 0.000000f, 1.000000f, 0.000000f, 0.000000f, -0.021996f, 0.997841f, 0.024864f, -0.000710f, -0.041199f, 0.991516f, 0.052429f, -0.002747f, 122 -0.057747f, 0.981255f, 0.082466f, -0.005974f, -0.071777f, 0.967285f, 0.114746f, -0.010254f, -0.083427f, 0.949837f, 0.149040f, -0.015450f, 123 -0.092834f, 0.929138f, 0.185120f, -0.021423f, -0.100136f, 0.905418f, 0.222755f, -0.028038f, -0.105469f, 0.878906f, 0.261719f, -0.035156f, 124 -0.108971f, 0.849831f, 0.301781f, -0.042641f, -0.110779f, 0.818420f, 0.342712f, -0.050354f, -0.111031f, 0.784904f, 0.384285f, -0.058159f, 125 -0.109863f, 0.749512f, 0.426270f, -0.065918f, -0.107414f, 0.712471f, 0.468437f, -0.073494f, -0.103821f, 0.674011f, 0.510559f, -0.080750f, 126 -0.099220f, 0.634361f, 0.552406f, -0.087547f, -0.093750f, 0.593750f, 0.593750f, -0.093750f, -0.087547f, 0.552406f, 0.634361f, -0.099220f, 127 -0.080750f, 0.510559f, 0.674011f, -0.103821f, -0.073494f, 0.468437f, 0.712471f, -0.107414f, -0.065918f, 0.426270f, 0.749512f, -0.109863f, 128 -0.058159f, 0.384285f, 0.784904f, -0.111031f, -0.050354f, 0.342712f, 0.818420f, -0.110779f, -0.042641f, 0.301781f, 0.849831f, -0.108971f, 129 -0.035156f, 0.261719f, 0.878906f, -0.105469f, -0.028038f, 0.222755f, 0.905418f, -0.100136f, -0.021423f, 0.185120f, 0.929138f, -0.092834f, 130 -0.015450f, 0.149040f, 0.949837f, -0.083427f, -0.010254f, 0.114746f, 0.967285f, -0.071777f, -0.005974f, 0.082466f, 0.981255f, -0.057747f, 131 -0.002747f, 0.052429f, 0.991516f, -0.041199f, -0.000710f, 0.024864f, 0.997841f, -0.021996f }; 132 133uchar4 read_pixels_cubic(__global const uchar * src, int tx, int ty, 134 int src_offset, int src_step, int src_cols, int src_rows, uchar scalar) 135{ 136 uchar4 pix; 137 138 if (tx >= 0 && (tx + 3) < src_cols && ty >= 0 && ty < src_rows) 139 { 140 pix = vload4(0, src + src_offset + ty * src_step + tx); 141 } 142 else 143 { 144 pix.s0 = GET_VAL((tx + 0), ty); 145 pix.s1 = GET_VAL((tx + 1), ty); 146 pix.s2 = GET_VAL((tx + 2), ty); 147 pix.s3 = GET_VAL((tx + 3), ty); 148 } 149 150 return pix; 151} 152 153__kernel void warpAffine_cubic_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, 154 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 155 __constant float * M, ST scalar_) 156{ 157 int x = get_global_id(0) * 4; 158 int y = get_global_id(1); 159 uchar scalar = convert_uchar_sat_rte(scalar_); 160 161 if (x >= dst_cols || y >= dst_rows) return; 162 163 /* { M0, M1, M2 } 164 * { M3, M4, M5 } 165 */ 166 167 float4 nx, ny; 168 nx = M[0] * convert_float4((vec_offset + (short4)x)) + M[1] * convert_float4((short4)y) + M[2]; 169 ny = M[3] * convert_float4((vec_offset + (short4)x)) + M[4] * convert_float4((short4)y) + M[5]; 170 171 int4 ax, ay; 172 ax = convert_int4_sat_rte((nx - floor(nx)) * 32.0f) & 31; 173 ay = convert_int4_sat_rte((ny - floor(ny)) * 32.0f) & 31; 174 175 int4 tx, ty; 176 int4 delta_x, delta_y; 177 178 delta_x = select((int4)1, (int4)0, ((nx - floor(nx))) * 64 > 63); 179 delta_y = select((int4)1, (int4)0, ((ny - floor(ny))) * 64 > 63); 180 181 tx = convert_int4_sat_rtn(nx) - delta_x; 182 ty = convert_int4_sat_rtn(ny) - delta_y; 183 184 __constant float * coeffs_x, * coeffs_y; 185 float4 sum = (float4)0.0f; 186 uchar4 pix; 187 float xsum; 188 189 coeffs_x = coeffs + (ax.s0 << 2); 190 coeffs_y = coeffs + (ay.s0 << 2); 191 for (int i = 0; i < 4; i++) 192 { 193 pix = read_pixels_cubic(src, tx.s0, ty.s0 + i, src_offset, src_step, src_cols, src_rows, scalar); 194 xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3])); 195 sum.s0 = fma(xsum, coeffs_y[i], sum.s0); 196 } 197 198 coeffs_x = coeffs + (ax.s1 << 2); 199 coeffs_y = coeffs + (ay.s1 << 2); 200 for (int i = 0; i < 4; i++) 201 { 202 pix = read_pixels_cubic(src, tx.s1, ty.s1 + i, src_offset, src_step, src_cols, src_rows, scalar); 203 xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3])); 204 sum.s1 = fma(xsum, coeffs_y[i], sum.s1); 205 } 206 207 coeffs_x = coeffs + (ax.s2 << 2); 208 coeffs_y = coeffs + (ay.s2 << 2); 209 for (int i = 0; i < 4; i++) 210 { 211 pix = read_pixels_cubic(src, tx.s2, ty.s2 + i, src_offset, src_step, src_cols, src_rows, scalar); 212 xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3])); 213 sum.s2 = fma(xsum, coeffs_y[i], sum.s2); 214 } 215 216 coeffs_x = coeffs + (ax.s3 << 2); 217 coeffs_y = coeffs + (ay.s3 << 2); 218 for (int i = 0; i < 4; i++) 219 { 220 pix = read_pixels_cubic(src, tx.s3, ty.s3 + i, src_offset, src_step, src_cols, src_rows, scalar); 221 xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3])); 222 sum.s3 = fma(xsum, coeffs_y[i], sum.s3); 223 } 224 225 int dst_index = x + y * dst_step + dst_offset; 226 vstore4(convert_uchar4_sat_rte(sum), 0, dst + dst_index); 227} 228 229__kernel void warpPerspective_nearest_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, 230 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 231 __constant float * M, ST scalar_) 232{ 233 int x = get_global_id(0) * 4; 234 int y = get_global_id(1); 235 uchar scalar = convert_uchar_sat_rte(scalar_); 236 237 if (x >= dst_cols || y >= dst_rows) return; 238 239 /* { M0, M1, M2 } 240 * { M3, M4, M5 } 241 * { M6, M7, M8 } 242 */ 243 244 float4 nx, ny, nz; 245 nx = M[0] * convert_float4(vec_offset + (short4)(x)) + 246 M[1] * convert_float4((short4)y) + M[2]; 247 248 ny = M[3] * convert_float4(vec_offset + (short4)(x)) + 249 M[4] * convert_float4((short4)y) + M[5]; 250 251 nz = M[6] * convert_float4(vec_offset + (short4)(x)) + 252 M[7] * convert_float4((short4)y) + M[8]; 253 254 short4 new_x, new_y; 255 float4 fz = select((float4)(0.0f), (float4)(1.0f / nz), nz != 0.0f); 256 new_x = convert_short4_sat_rte(nx * fz); 257 new_y = convert_short4_sat_rte(ny * fz); 258 259 uchar4 pix = (uchar4)scalar; 260 261 pix.s0 = GET_VAL(new_x.s0, new_y.s0); 262 pix.s1 = GET_VAL(new_x.s1, new_y.s1); 263 pix.s2 = GET_VAL(new_x.s2, new_y.s2); 264 pix.s3 = GET_VAL(new_x.s3, new_y.s3); 265 266 int dst_index = x + y * dst_step + dst_offset; 267 268 vstore4(pix, 0, dst + dst_index); 269} 270 271__kernel void warpPerspective_linear_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, 272 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 273 __constant float * M, ST scalar_) 274{ 275 int x = get_global_id(0) * 4; 276 int y = get_global_id(1); 277 uchar scalar = convert_uchar_sat_rte(scalar_); 278 279 if (x >= dst_cols || y >= dst_rows) return; 280 281 /* { M0, M1, M2 } 282 * { M3, M4, M5 } 283 * { M6, M7, M8 } 284 */ 285 286 float4 nx, ny, nz; 287 nx = M[0] * convert_float4(vec_offset + (short4)(x)) + M[1] * convert_float4((short4)y) + M[2]; 288 289 ny = M[3] * convert_float4(vec_offset + (short4)(x)) + M[4] * convert_float4((short4)y) + M[5]; 290 291 nz = M[6] * convert_float4(vec_offset + (short4)(x)) + M[7] * convert_float4((short4)y) + M[8]; 292 293 float4 fz = select((float4)(0.0f), (float4)(1.0f / nz), nz != 0.0f); 294 295 nx = nx * fz; 296 ny = ny * fz; 297 298 float4 s, t; 299 s = round((nx - floor(nx)) * 32.0f) / (float4)32.0f; 300 t = round((ny - floor(ny)) * 32.0f) / (float4)32.0f; 301 302 short4 tx, ty; 303 tx = convert_short4_sat_rtn(nx); 304 ty = convert_short4_sat_rtn(ny); 305 306 uchar4 pix[4]; 307 pix[0] = read_pixels(src, tx.s0, ty.s0, src_offset, src_step, src_cols, src_rows, scalar); 308 pix[1] = read_pixels(src, tx.s1, ty.s1, src_offset, src_step, src_cols, src_rows, scalar); 309 pix[2] = read_pixels(src, tx.s2, ty.s2, src_offset, src_step, src_cols, src_rows, scalar); 310 pix[3] = read_pixels(src, tx.s3, ty.s3, src_offset, src_step, src_cols, src_rows, scalar); 311 312 float4 tl, tr, bl, br; 313 tl = convert_float4((uchar4)(pix[0].s0, pix[1].s0, pix[2].s0, pix[3].s0)); 314 tr = convert_float4((uchar4)(pix[0].s1, pix[1].s1, pix[2].s1, pix[3].s1)); 315 bl = convert_float4((uchar4)(pix[0].s2, pix[1].s2, pix[2].s2, pix[3].s2)); 316 br = convert_float4((uchar4)(pix[0].s3, pix[1].s3, pix[2].s3, pix[3].s3)); 317 318 float4 pixel; 319 pixel = tl * (1 - s) * (1 - t) + tr * s * (1 - t) + bl * (1 - s) * t + br * s * t; 320 321 int dst_index = x + y * dst_step + dst_offset; 322 vstore4(convert_uchar4_sat_rte(pixel), 0, dst + dst_index); 323} 324 325__kernel void warpPerspective_cubic_8u(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, 326 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 327 __constant float * M, ST scalar_) 328{ 329 int x = get_global_id(0) * 4; 330 int y = get_global_id(1); 331 uchar scalar = convert_uchar_sat_rte(scalar_); 332 333 if (x >= dst_cols || y >= dst_rows) return; 334 335 /* { M0, M1, M2 } 336 * { M3, M4, M5 } 337 * { M6, M7, M8 } 338 */ 339 340 float4 nx, ny, nz; 341 nx = M[0] * convert_float4(vec_offset + (short4)(x)) + M[1] * convert_float4((short4)y) + M[2]; 342 343 ny = M[3] * convert_float4(vec_offset + (short4)(x)) + M[4] * convert_float4((short4)y) + M[5]; 344 345 nz = M[6] * convert_float4(vec_offset + (short4)(x)) + M[7] * convert_float4((short4)y) + M[8]; 346 347 float4 fz = select((float4)(0.0f), (float4)(1.0f / nz), nz != 0.0f); 348 349 nx = nx * fz; 350 ny = ny * fz; 351 352 int4 ax, ay; 353 ax = convert_int4_sat_rte((nx - floor(nx)) * 32.0f) & 31; 354 ay = convert_int4_sat_rte((ny - floor(ny)) * 32.0f) & 31; 355 356 int4 tx, ty; 357 int4 delta_x, delta_y; 358 359 delta_x = select((int4)1, (int4)0, ((nx - floor(nx))) * 64 > 63); 360 delta_y = select((int4)1, (int4)0, ((ny - floor(ny))) * 64 > 63); 361 362 tx = convert_int4_sat_rtn(nx) - delta_x; 363 ty = convert_int4_sat_rtn(ny) - delta_y; 364 365 __constant float * coeffs_x, * coeffs_y; 366 float4 sum = (float4)0.0f; 367 uchar4 pix; 368 float xsum; 369 370 coeffs_x = coeffs + (ax.s0 << 2); 371 coeffs_y = coeffs + (ay.s0 << 2); 372 for (int i = 0; i < 4; i++) 373 { 374 pix = read_pixels_cubic(src, tx.s0, ty.s0 + i, src_offset, src_step, src_cols, src_rows, scalar); 375 xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3])); 376 sum.s0 = fma(xsum, coeffs_y[i], sum.s0); 377 } 378 379 coeffs_x = coeffs + (ax.s1 << 2); 380 coeffs_y = coeffs + (ay.s1 << 2); 381 for (int i = 0; i < 4; i++) 382 { 383 pix = read_pixels_cubic(src, tx.s1, ty.s1 + i, src_offset, src_step, src_cols, src_rows, scalar); 384 xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3])); 385 sum.s1 = fma(xsum, coeffs_y[i], sum.s1); 386 } 387 388 coeffs_x = coeffs + (ax.s2 << 2); 389 coeffs_y = coeffs + (ay.s2 << 2); 390 for (int i = 0; i < 4; i++) 391 { 392 pix = read_pixels_cubic(src, tx.s2, ty.s2 + i, src_offset, src_step, src_cols, src_rows, scalar); 393 xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3])); 394 sum.s2 = fma(xsum, coeffs_y[i], sum.s2); 395 } 396 397 coeffs_x = coeffs + (ax.s3 << 2); 398 coeffs_y = coeffs + (ay.s3 << 2); 399 for (int i = 0; i < 4; i++) 400 { 401 pix = read_pixels_cubic(src, tx.s3, ty.s3 + i, src_offset, src_step, src_cols, src_rows, scalar); 402 xsum = dot(convert_float4(pix), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3])); 403 sum.s3 = fma(xsum, coeffs_y[i], sum.s3); 404 } 405 406 int dst_index = x + y * dst_step + dst_offset; 407 vstore4(convert_uchar4_sat_rte(sum), 0, dst + dst_index); 408} 409