1/*M/////////////////////////////////////////////////////////////////////////////////////// 2// 3// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4// 5// By downloading, copying, installing or using the software you agree to this license. 6// If you do not agree to this license, do not download, install, 7// copy or use the software. 8// 9// 10// License Agreement 11// For Open Source Computer Vision Library 12// 13// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. 14// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. 15// Third party copyrights are property of their respective owners. 16// 17// @Authors 18// Dachuan Zhao, dachuan@multicorewareinc.com 19// 20// Redistribution and use in source and binary forms, with or without modification, 21// are permitted provided that the following conditions are met: 22// 23// * Redistribution's of source code must retain the above copyright notice, 24// this list of conditions and the following disclaimer. 25// 26// * Redistribution's in binary form must reproduce the above copyright notice, 27// this list of conditions and the following disclaimer in the documentation 28// and/or other materials provided with the distribution. 29// 30// * The name of the copyright holders may not be used to endorse or promote products 31// derived from this software without specific prior written permission. 32// 33// This software is provided by the copyright holders and contributors as is and 34// any express or implied warranties, including, but not limited to, the implied 35// warranties of merchantability and fitness for a particular purpose are disclaimed. 36// In no event shall the Intel Corporation or contributors be liable for any direct, 37// indirect, incidental, special, exemplary, or consequential damages 38// (including, but not limited to, procurement of substitute goods or services; 39// loss of use, data, or profits; or business interruption) however caused 40// and on any theory of liability, whether in contract, strict liability, 41// or tort (including negligence or otherwise) arising in any way out of 42// the use of this software, even if advised of the possibility of such damage. 43// 44//M*/ 45 46#ifdef DOUBLE_SUPPORT 47#ifdef cl_amd_fp64 48#pragma OPENCL EXTENSION cl_amd_fp64:enable 49#elif defined (cl_khr_fp64) 50#pragma OPENCL EXTENSION cl_khr_fp64:enable 51#endif 52#endif 53 54#if defined BORDER_REPLICATE 55// aaaaaa|abcdefgh|hhhhhhh 56#define EXTRAPOLATE(x, maxV) clamp((x), 0, (maxV)-1) 57#elif defined BORDER_WRAP 58// cdefgh|abcdefgh|abcdefg 59#define EXTRAPOLATE(x, maxV) ( (x) + (maxV) ) % (maxV) 60#elif defined BORDER_REFLECT 61// fedcba|abcdefgh|hgfedcb 62#define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ), 0, (maxV)-1) 63#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101 64// gfedcb|abcdefgh|gfedcba 65#define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x), max((x),-(x)) ), 0, (maxV)-1) 66#else 67#error No extrapolation method 68#endif 69 70#if cn != 3 71#define loadpix(addr) *(__global const T*)(addr) 72#define storepix(val, addr) *(__global T*)(addr) = (val) 73#define PIXSIZE ((int)sizeof(T)) 74#else 75#define loadpix(addr) vload3(0, (__global const T1*)(addr)) 76#define storepix(val, addr) vstore3((val), 0, (__global T1*)(addr)) 77#define PIXSIZE ((int)sizeof(T1)*3) 78#endif 79 80#define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x))) 81 82#if kercn == 4 83#define SRC4(_x,_y) convert_float4(vload4(0, srcData + mad24(_y, src_step, PIXSIZE * _x))) 84#endif 85 86#ifdef INTEL_DEVICE 87#define MAD(x,y,z) fma((x),(y),(z)) 88#else 89#define MAD(x,y,z) mad((x),(y),(z)) 90#endif 91 92#define LOAD_LOCAL(col_gl, col_lcl) \ 93 sum0 = co3* SRC(col_gl, EXTRAPOLATE_(src_y - 2, src_rows)); \ 94 sum0 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum0); \ 95 temp = SRC(col_gl, EXTRAPOLATE_(src_y, src_rows)); \ 96 sum0 = MAD(co1, temp, sum0); \ 97 sum1 = co3 * temp; \ 98 temp = SRC(col_gl, EXTRAPOLATE_(src_y + 1, src_rows)); \ 99 sum0 = MAD(co2, temp, sum0); \ 100 sum1 = MAD(co2, temp, sum1); \ 101 temp = SRC(col_gl, EXTRAPOLATE_(src_y + 2, src_rows)); \ 102 sum0 = MAD(co3, temp, sum0); \ 103 sum1 = MAD(co1, temp, sum1); \ 104 smem[0][col_lcl] = sum0; \ 105 sum1 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum1); \ 106 sum1 = MAD(co3, SRC(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum1); \ 107 smem[1][col_lcl] = sum1; 108 109 110#if kercn == 4 111#define LOAD_LOCAL4(col_gl, col_lcl) \ 112 sum40 = co3* SRC4(col_gl, EXTRAPOLATE_(src_y - 2, src_rows)); \ 113 sum40 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum40); \ 114 temp4 = SRC4(col_gl, EXTRAPOLATE_(src_y, src_rows)); \ 115 sum40 = MAD(co1, temp4, sum40); \ 116 sum41 = co3 * temp4; \ 117 temp4 = SRC4(col_gl, EXTRAPOLATE_(src_y + 1, src_rows)); \ 118 sum40 = MAD(co2, temp4, sum40); \ 119 sum41 = MAD(co2, temp4, sum41); \ 120 temp4 = SRC4(col_gl, EXTRAPOLATE_(src_y + 2, src_rows)); \ 121 sum40 = MAD(co3, temp4, sum40); \ 122 sum41 = MAD(co1, temp4, sum41); \ 123 vstore4(sum40, col_lcl, (__local float*) &smem[0][2]); \ 124 sum41 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum41); \ 125 sum41 = MAD(co3, SRC4(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum41); \ 126 vstore4(sum41, col_lcl, (__local float*) &smem[1][2]); 127#endif 128 129#define noconvert 130 131__kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols, 132 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) 133{ 134 const int x = get_global_id(0)*kercn; 135 const int y = 2*get_global_id(1); 136 137 __local FT smem[2][LOCAL_SIZE + 4]; 138 __global uchar * dstData = dst + dst_offset; 139 __global const uchar * srcData = src + src_offset; 140 141 FT sum0, sum1, temp; 142 FT co1 = 0.375f; 143 FT co2 = 0.25f; 144 FT co3 = 0.0625f; 145 146 const int src_y = 2*y; 147 int col; 148 149 if (src_y >= 2 && src_y < src_rows - 4) 150 { 151#undef EXTRAPOLATE_ 152#define EXTRAPOLATE_(val, maxVal) val 153#if kercn == 1 154 col = EXTRAPOLATE(x, src_cols); 155 LOAD_LOCAL(col, 2 + get_local_id(0)) 156#else 157 if (x < src_cols-4) 158 { 159 float4 sum40, sum41, temp4; 160 LOAD_LOCAL4(x, get_local_id(0)) 161 } 162 else 163 { 164 for (int i=0; i<4; i++) 165 { 166 col = EXTRAPOLATE(x+i, src_cols); 167 LOAD_LOCAL(col, 2 + 4 * get_local_id(0) + i) 168 } 169 } 170#endif 171 if (get_local_id(0) < 2) 172 { 173 col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); 174 LOAD_LOCAL(col, get_local_id(0)) 175 } 176 else if (get_local_id(0) < 4) 177 { 178 col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); 179 LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0)) 180 } 181 } 182 else // need extrapolate y 183 { 184#undef EXTRAPOLATE_ 185#define EXTRAPOLATE_(val, maxVal) EXTRAPOLATE(val, maxVal) 186#if kercn == 1 187 col = EXTRAPOLATE(x, src_cols); 188 LOAD_LOCAL(col, 2 + get_local_id(0)) 189#else 190 if (x < src_cols-4) 191 { 192 float4 sum40, sum41, temp4; 193 LOAD_LOCAL4(x, get_local_id(0)) 194 } 195 else 196 { 197 for (int i=0; i<4; i++) 198 { 199 col = EXTRAPOLATE(x+i, src_cols); 200 LOAD_LOCAL(col, 2 + 4*get_local_id(0) + i) 201 } 202 } 203#endif 204 if (get_local_id(0) < 2) 205 { 206 col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); 207 LOAD_LOCAL(col, get_local_id(0)) 208 } 209 else if (get_local_id(0) < 4) 210 { 211 col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols); 212 LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0)) 213 } 214 } 215 216 barrier(CLK_LOCAL_MEM_FENCE); 217 218#if kercn == 1 219 if (get_local_id(0) < LOCAL_SIZE / 2) 220 { 221 const int tid2 = get_local_id(0) * 2; 222 223 const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; 224 225 if (dst_x < dst_cols) 226 { 227 for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++) 228 { 229#if cn == 1 230#if fdepth <= 5 231 FT sum = dot(vload4(0, (__local float*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (float4)(co3, co2, co1, co2)); 232#else 233 FT sum = dot(vload4(0, (__local double*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (double4)(co3, co2, co1, co2)); 234#endif 235#else 236 FT sum = co3 * smem[yin - y][2 + tid2 - 2]; 237 sum = MAD(co2, smem[yin - y][2 + tid2 - 1], sum); 238 sum = MAD(co1, smem[yin - y][2 + tid2 ], sum); 239 sum = MAD(co2, smem[yin - y][2 + tid2 + 1], sum); 240#endif 241 sum = MAD(co3, smem[yin - y][2 + tid2 + 2], sum); 242 storepix(convertToT(sum), dstData + yin * dst_step + dst_x * PIXSIZE); 243 } 244 } 245 } 246#else 247 int tid4 = get_local_id(0) * 4; 248 int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2; 249 if (dst_x < dst_cols - 1) 250 { 251 for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++) 252 { 253 254 FT sum = co3* smem[yin - y][2 + tid4 + 2]; 255 sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum); 256 sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum); 257 sum = MAD(co1, smem[yin - y][2 + tid4 ], sum); 258 sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum); 259 storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE)); 260 261 dst_x ++; 262 sum = co3* smem[yin - y][2 + tid4 + 4]; 263 sum = MAD(co3, smem[yin - y][2 + tid4 ], sum); 264 sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum); 265 sum = MAD(co1, smem[yin - y][2 + tid4 + 2], sum); 266 sum = MAD(co2, smem[yin - y][2 + tid4 + 3], sum); 267 storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE)); 268 dst_x --; 269 } 270 271 } 272 else if (dst_x < dst_cols) 273 { 274 for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++) 275 { 276 FT sum = co3* smem[yin - y][2 + tid4 + 2]; 277 sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum); 278 sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum); 279 sum = MAD(co1, smem[yin - y][2 + tid4 ], sum); 280 sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum); 281 282 storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE)); 283 } 284 } 285#endif 286 287} 288