/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
38 */ 39 40 #ifndef CAROTENE_SRC_SEPARABLE_FILTER_HPP 41 #define CAROTENE_SRC_SEPARABLE_FILTER_HPP 42 43 #include "common.hpp" 44 45 #include <carotene/types.hpp> 46 47 #include <vector> 48 49 #ifdef CAROTENE_NEON 50 51 namespace CAROTENE_NS { 52 53 namespace internal { 54 55 struct RowFilter3x3S16Base 56 { 57 typedef u8 srcType; 58 /* 59 Various border types, image boundaries are denoted with '|' 60 61 * BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh 62 * BORDER_REFLECT: fedcba|abcdefgh|hgfedcb 63 * BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba 64 * BORDER_WRAP: cdefgh|abcdefgh|abcdefg 65 * BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii with some specified 'i' 66 */ RowFilter3x3S16BaseCAROTENE_NS::internal::RowFilter3x3S16Base67 inline RowFilter3x3S16Base(const BORDER_MODE _borderType, const srcType _borderValue, const ptrdiff_t borderxl, const ptrdiff_t borderxr): 68 borderType(_borderType),borderValue(_borderValue) 69 { 70 if (borderType == BORDER_MODE_CONSTANT) 71 { 72 vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x00ffFFffFFffFFffULL : 0x0100FFffFFffFFffULL)); 73 vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0xFF07060504030201ULL : 0x0706050403020100ULL)); 74 } 75 else if (borderType == BORDER_MODE_REFLECT101) 76 { 77 vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0001FFffFFffFFffULL : 0x0100FFffFFffFFffULL)); 78 vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0x0607060504030201ULL : 0x0706050403020100ULL)); 79 } 80 else //if (borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REPLICATE) 81 { 82 vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0000FFffFFffFFffULL : 0x0100FFffFFffFFffULL)); 83 vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 
0x0707060504030201ULL : 0x0706050403020100ULL)); 84 } 85 lookLeft = offsetk - borderxl; 86 lookRight = offsetk - borderxr; 87 } 88 89 uint8x8_t vfmask; 90 uint8x8_t vtmask; 91 enum { offsetk = 1}; 92 ptrdiff_t lookLeft; 93 ptrdiff_t lookRight; 94 const BORDER_MODE borderType; 95 const srcType borderValue; 96 }; 97 98 struct ColFilter3x3S16Base 99 { 100 typedef s16 srcType; 101 ColFilter3x3S16BaseCAROTENE_NS::internal::ColFilter3x3S16Base102 inline ColFilter3x3S16Base(const BORDER_MODE _borderType, const srcType _borderValue): 103 borderType(_borderType),borderValue(_borderValue) {} 104 105 enum { offsetk = 1}; 106 const BORDER_MODE borderType; 107 const srcType borderValue; 108 }; 109 110 struct RowFilter3x3S16Generic : public RowFilter3x3S16Base 111 { 112 typedef s16 dstType; 113 RowFilter3x3S16GenericCAROTENE_NS::internal::RowFilter3x3S16Generic114 inline RowFilter3x3S16Generic(BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16 *w): 115 RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter( (w[0]+w[1]+w[2]) * borderValue ) 116 { 117 vw0 = vdupq_n_s16(w[0]); 118 vw1 = vdupq_n_s16(w[1]); 119 vw2 = vdupq_n_s16(w[2]); 120 } 121 122 int16x8_t vw0; 123 int16x8_t vw1; 124 int16x8_t vw2; 125 const dstType borderFilter; 126 operator ()CAROTENE_NS::internal::RowFilter3x3S16Generic127 inline void operator()(const u8* src, s16* dst, ptrdiff_t width) 128 { 129 uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask); 130 if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) 131 l = vset_lane_u8(borderValue, l, 6); 132 133 ptrdiff_t i = 0; 134 for (; i < width - 16 + lookRight; i += 16) 135 { 136 internal::prefetch(src + i); 137 uint8x8_t l18u = vld1_u8(src + i + 1); 138 vst1q_s16(dst + i, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 6))), vw0), 139 vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 7))), vw1), 140 vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l18u)), 
vw2))); 141 l = vld1_u8(src + i + 9); 142 vst1q_s16(dst + i + 8, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l18u, l, 6))), vw0), 143 vreinterpretq_s16_u16(vmovl_u8(vext_u8(l18u, l, 7))), vw1), 144 vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l)), vw2))); 145 } 146 if (i < width - 8 + lookRight) 147 { 148 uint8x8_t l18u = vld1_u8(src + i + 1); 149 vst1q_s16(dst + i, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 6))), vw0), 150 vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 7))), vw1), 151 vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l18u)), vw2))); 152 i += 8; 153 } 154 155 //tail 156 if (lookRight == 0 || i != width) 157 { 158 uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 159 uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); 160 if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) 161 tail2 = vset_lane_u8(borderValue, tail2, 7); 162 uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7); 163 164 int16x8_t l0 = vreinterpretq_s16_u16(vmovl_u8(tail0)); 165 int16x8_t l1 = vreinterpretq_s16_u16(vmovl_u8(tail1)); 166 int16x8_t l2 = vreinterpretq_s16_u16(vmovl_u8(tail2)); 167 168 int16x8_t l0w = vmulq_s16(l0, vw0); 169 int16x8_t l2w = vmulq_s16(l2, vw2); 170 int16x8_t ls = vaddq_s16(vmlaq_s16(l0w, l1, vw1), l2w); 171 172 vst1q_s16(dst + (width - 8), ls); 173 } 174 } 175 }; 176 177 struct RowFilter3x3S16_m101 : public RowFilter3x3S16Base 178 { 179 typedef s16 dstType; 180 RowFilter3x3S16_m101CAROTENE_NS::internal::RowFilter3x3S16_m101181 inline RowFilter3x3S16_m101(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*): 182 RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(0) {} 183 184 const dstType borderFilter; 185 operator ()CAROTENE_NS::internal::RowFilter3x3S16_m101186 inline void operator()(const u8* 
src, s16* dst, ptrdiff_t width) 187 { 188 uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask); 189 if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) 190 l = vset_lane_u8(borderValue, l, 6); 191 192 ptrdiff_t i = 0; 193 for (; i < width - 16 + lookRight; i += 16) 194 { 195 internal::prefetch(src + i); 196 197 uint8x8_t l2 = vld1_u8(src + i + 1); 198 vst1q_s16(dst + i, vreinterpretq_s16_u16(vsubl_u8(l2, vext_u8(l, l2, 6)))); 199 200 l = vld1_u8(src + i + 9); 201 vst1q_s16(dst + i + 8, vreinterpretq_s16_u16(vsubl_u8(l, vext_u8(l2, l, 6)))); 202 } 203 204 if (i < width - 8 + lookRight) 205 { 206 uint8x8_t l2 = vld1_u8(src + i + 1); 207 vst1q_s16(dst + i, vreinterpretq_s16_u16(vsubl_u8(l2, vext_u8(l, l2, 6)))); 208 i += 8; 209 } 210 211 //tail 212 if (lookRight == 0 || i != width) 213 { 214 uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 215 uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); 216 if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) 217 tail2 = vset_lane_u8(borderValue, tail2, 7); 218 219 int16x8_t ls = vreinterpretq_s16_u16(vsubl_u8(tail2, tail0)); 220 221 vst1q_s16(dst + (width - 8), ls); 222 } 223 } 224 }; 225 226 struct RowFilter3x3S16_121 : public RowFilter3x3S16Base 227 { 228 typedef s16 dstType; 229 RowFilter3x3S16_121CAROTENE_NS::internal::RowFilter3x3S16_121230 inline RowFilter3x3S16_121(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*): 231 RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(borderValue << 2) {} 232 233 const dstType borderFilter; 234 operator ()CAROTENE_NS::internal::RowFilter3x3S16_121235 inline void operator()(const u8* src, s16* dst, ptrdiff_t width) 236 { 237 uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask); 238 if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) 239 l = vset_lane_u8(borderValue, l, 6); 240 241 ptrdiff_t i = 0; 242 for 
(; i < width - 16 + lookRight; i += 16) 243 { 244 internal::prefetch(src + i); 245 246 uint8x8_t l2 = vld1_u8(src + i + 1); 247 vst1q_s16(dst + i, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), 248 vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); 249 250 l = vld1_u8(src + i + 9); 251 vst1q_s16(dst + i + 8, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l2, l, 6), l)), 252 vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l2, l, 7), 1)))); 253 } 254 255 if (i < width - 8 + lookRight) 256 { 257 uint8x8_t l2 = vld1_u8(src + i + 1); 258 vst1q_s16(dst + i, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), 259 vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); 260 i += 8; 261 } 262 263 //tail 264 if (lookRight == 0 || i != width) 265 { 266 uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 267 uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); 268 if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) 269 tail2 = vset_lane_u8(borderValue, tail2, 7); 270 uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7); 271 272 int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail0, tail2)); 273 int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1)); 274 275 int16x8_t ls = vqaddq_s16(tail02, tail1x2); 276 277 vst1q_s16(dst + (width - 8), ls); 278 } 279 } 280 }; 281 282 struct RowFilter3x3S16_1m21 : public RowFilter3x3S16Base 283 { 284 typedef s16 dstType; 285 RowFilter3x3S16_1m21CAROTENE_NS::internal::RowFilter3x3S16_1m21286 inline RowFilter3x3S16_1m21(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*): 287 RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(0) {} 288 289 const dstType borderFilter; 290 operator ()CAROTENE_NS::internal::RowFilter3x3S16_1m21291 inline void operator()(const u8* src, s16* dst, ptrdiff_t 
width) 292 { 293 uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask); 294 if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) 295 l = vset_lane_u8(borderValue, l, 6); 296 297 ptrdiff_t i = 0; 298 for (; i < width - 16 + lookRight; i += 16) 299 { 300 internal::prefetch(src + i); 301 302 uint8x8_t l2 = vld1_u8(src + i + 1); 303 vst1q_s16(dst + i, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), 304 vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); 305 306 l = vld1_u8(src + i + 9); 307 vst1q_s16(dst + i + 8, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l2, l, 6), l)), 308 vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l2, l, 7), 1)))); 309 } 310 311 if (i < width - 8 + lookRight) 312 { 313 uint8x8_t l2 = vld1_u8(src + i + 1); 314 vst1q_s16(dst + i, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), 315 vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); 316 i += 8; 317 } 318 319 //tail 320 if (lookRight == 0 || i != width) 321 { 322 uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 323 uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); 324 if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) 325 tail2 = vset_lane_u8(borderValue, tail2, 7); 326 uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7); 327 328 int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail0, tail2)); 329 int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1)); 330 331 int16x8_t ls = vqsubq_s16(tail02, tail1x2); 332 333 vst1q_s16(dst + (width - 8), ls); 334 } 335 } 336 }; 337 338 struct ColFilter3x3S16Generic : public ColFilter3x3S16Base 339 { 340 typedef s16 dstType; 341 ColFilter3x3S16GenericCAROTENE_NS::internal::ColFilter3x3S16Generic342 inline ColFilter3x3S16Generic(const BORDER_MODE _borderType, const srcType _borderValue, const s16 *w): 343 ColFilter3x3S16Base(_borderType, _borderValue) 344 { 345 vw0 = 
vdupq_n_s16(w[0]); 346 vw1 = vdupq_n_s16(w[1]); 347 vw2 = vdupq_n_s16(w[2]); 348 } 349 350 int16x8_t vw0; 351 int16x8_t vw1; 352 int16x8_t vw2; 353 operator ()CAROTENE_NS::internal::ColFilter3x3S16Generic354 inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) 355 { 356 ptrdiff_t j = 0; 357 for (; j <= width - 16; j += 16) 358 { 359 int16x8_t line1 = vld1q_s16(src1 + j); 360 int16x8_t line2 = vld1q_s16(src2 + j); 361 vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2)); 362 vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1)); 363 364 line1 = vld1q_s16(src1 + j + 8); 365 line2 = vld1q_s16(src2 + j + 8); 366 vst1q_s16(dst0 + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j + 8), vw0), line1, vw1), line2, vw2)); 367 vst1q_s16(dst1 + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j + 8), vw2), line1, vw0), line2, vw1)); 368 } 369 if (j <= width - 8) 370 { 371 int16x8_t line1 = vld1q_s16(src1 + j); 372 int16x8_t line2 = vld1q_s16(src2 + j); 373 vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2)); 374 vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1)); 375 j += 8; 376 } 377 if (j != width) 378 { 379 j = width - 8; 380 int16x8_t line1 = vld1q_s16(src1 + j); 381 int16x8_t line2 = vld1q_s16(src2 + j); 382 vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2)); 383 vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1)); 384 } 385 } 386 operator ()CAROTENE_NS::internal::ColFilter3x3S16Generic387 inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) 388 { 389 if (src0 == 0 || src2 == 0) 390 { 391 int16x8_t vwl1 = vw0; 392 int16x8_t vwl2 = vw2; 393 
if (src2 == 0) 394 { 395 src2 = src0; 396 vwl1 = vw2; 397 vwl2 = vw0; 398 } 399 400 int16x8_t v_border = vdupq_n_s16(0); 401 if (borderType == BORDER_MODE_CONSTANT) 402 { 403 v_border = vmulq_s16(vdupq_n_s16(borderValue), vwl1); 404 vwl1 = vw1; 405 } 406 else if (borderType == BORDER_MODE_REFLECT101) 407 { 408 vwl1 = vw1; 409 vwl2 = vaddq_s16(vw0, vw2); 410 } 411 else //replicate\reflect 412 vwl1 = vaddq_s16(vwl1, vw1); 413 414 ptrdiff_t j = 0; 415 for (; j <= width - 16; j += 16) 416 { 417 vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1), 418 vmulq_s16(vld1q_s16(src2 + j), vwl2))); 419 vst1q_s16(dst + j + 8, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j + 8), vwl1), 420 vmulq_s16(vld1q_s16(src2 + j + 8), vwl2))); 421 } 422 if (j <= width - 8) 423 { 424 vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1), 425 vmulq_s16(vld1q_s16(src2 + j), vwl2))); 426 j += 8; 427 } 428 if (j != width) 429 { 430 j = width - 8; 431 vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1), 432 vmulq_s16(vld1q_s16(src2 + j), vwl2))); 433 } 434 } 435 else 436 { 437 ptrdiff_t j = 0; 438 for (; j <= width - 16; j += 16) 439 { 440 vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), 441 vld1q_s16(src1 + j), vw1), 442 vld1q_s16(src2 + j), vw2)); 443 vst1q_s16(dst + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j + 8), vw0), 444 vld1q_s16(src1 + j + 8), vw1), 445 vld1q_s16(src2 + j + 8), vw2)); 446 } 447 if (j <= width - 8) 448 { 449 vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), 450 vld1q_s16(src1 + j), vw1), 451 vld1q_s16(src2 + j), vw2)); 452 j += 8; 453 } 454 if (j != width) 455 { 456 j = width - 8; 457 vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), 458 vld1q_s16(src1 + j), vw1), 459 vld1q_s16(src2 + j), vw2)); 460 } 461 } 462 } 463 }; 464 465 struct ColFilter3x3S16_m101 : public ColFilter3x3S16Base 466 { 467 typedef s16 
dstType; 468 ColFilter3x3S16_m101CAROTENE_NS::internal::ColFilter3x3S16_m101469 inline ColFilter3x3S16_m101(const BORDER_MODE _borderType, const srcType _borderValue, const s16 *): 470 ColFilter3x3S16Base(_borderType, _borderValue) {} 471 operator ()CAROTENE_NS::internal::ColFilter3x3S16_m101472 inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) 473 { 474 ptrdiff_t j = 0; 475 for (; j <= width - 16; j += 16) 476 { 477 vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); 478 vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j))); 479 vst1q_s16(dst0 + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8))); 480 vst1q_s16(dst1 + j + 8, vqsubq_s16(vld1q_s16(src3 + j + 8), vld1q_s16(src1 + j + 8))); 481 } 482 if (j <= width - 8) 483 { 484 vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); 485 vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j))); 486 j += 8; 487 } 488 if (j != width) 489 { 490 j = width - 8; 491 vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); 492 vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j))); 493 } 494 } 495 operator ()CAROTENE_NS::internal::ColFilter3x3S16_m101496 inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) 497 { 498 if (src0 == 0 || src2 == 0) 499 { 500 if (borderType == BORDER_MODE_CONSTANT) 501 { 502 int16x8_t v_border = vdupq_n_s16(borderValue); 503 if (src0 == 0) 504 { 505 ptrdiff_t j = 0; 506 for (; j <= width - 16; j += 16) 507 { 508 vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), v_border)); 509 vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), v_border)); 510 } 511 if (j <= width - 8) 512 { 513 vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), v_border)); 514 j += 8; 515 } 516 if (j != width) 517 { 518 j = width - 8; 519 vst1q_s16(dst + j, 
vqsubq_s16(vld1q_s16(src2 + j), v_border)); 520 } 521 } 522 else 523 { 524 ptrdiff_t j = 0; 525 for (; j <= width - 16; j += 16) 526 { 527 vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j))); 528 vst1q_s16(dst + j + 8, vqsubq_s16(v_border, vld1q_s16(src0 + j + 8))); 529 } 530 if (j <= width - 8) 531 { 532 vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j))); 533 j += 8; 534 } 535 if (j != width) 536 { 537 j = width - 8; 538 vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j))); 539 } 540 } 541 } 542 else if (borderType == BORDER_MODE_REFLECT101) 543 { 544 int16x8_t vzero = vmovq_n_s16(0); 545 ptrdiff_t j = 0; 546 for (; j <= width - 16; j += 16) 547 { 548 vst1q_s16(dst + j, vzero); 549 vst1q_s16(dst + j + 8, vzero); 550 } 551 if (j <= width - 8) 552 { 553 vst1q_s16(dst + j, vzero); 554 j += 8; 555 } 556 if (j != width) 557 { 558 j = width - 8; 559 vst1q_s16(dst + j, vzero); 560 } 561 } 562 else //replicate\reflect 563 { 564 if (src0 == 0) src0 = src1; else src2 = src1; 565 ptrdiff_t j = 0; 566 for (; j <= width - 16; j += 16) 567 { 568 vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); 569 vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8))); 570 } 571 if (j <= width - 8) 572 { 573 vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); 574 j += 8; 575 } 576 if (j != width) 577 { 578 j = width - 8; 579 vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); 580 } 581 } 582 } 583 else 584 { 585 ptrdiff_t j = 0; 586 for (; j <= width - 16; j += 16) 587 { 588 vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); 589 vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8))); 590 } 591 if (j <= width - 8) 592 { 593 vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); 594 j += 8; 595 } 596 if (j != width) 597 { 598 j = width - 8; 599 vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), 
vld1q_s16(src0 + j))); 600 } 601 } 602 } 603 }; 604 605 struct ColFilter3x3S16_121 : public ColFilter3x3S16Base 606 { 607 typedef s16 dstType; 608 ColFilter3x3S16_121CAROTENE_NS::internal::ColFilter3x3S16_121609 inline ColFilter3x3S16_121(const BORDER_MODE _borderType, const srcType _borderValue, const s16*): 610 ColFilter3x3S16Base(_borderType, _borderValue) {} 611 operator ()CAROTENE_NS::internal::ColFilter3x3S16_121612 inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) 613 { 614 ptrdiff_t j = 0; 615 //int16x8_t line0 = vld1q_s16(src0 + j);//1 616 //int16x8_t line1 = vld1q_s16(src1 + j);//11 617 //int16x8_t line2 = vld1q_s16(src2 + j);// 11 618 //int16x8_t line3 = vld1q_s16(src3 + j);// 1 619 for (; j <= width - 16; j += 16) 620 { 621 int16x8_t line1 = vld1q_s16(src1 + j); 622 int16x8_t line2 = vld1q_s16(src2 + j); 623 624 int16x8_t l12 = vqaddq_s16(line1, line2); 625 626 vst1q_s16(dst0 + j, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12)); 627 vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j)))); 628 629 line1 = vld1q_s16(src1 + j + 8); 630 line2 = vld1q_s16(src2 + j + 8); 631 632 l12 = vqaddq_s16(line1, line2); 633 634 vst1q_s16(dst0 + j + 8, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j + 8), line1), l12)); 635 vst1q_s16(dst1 + j + 8, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j + 8)))); 636 } 637 if (j <= width - 8) 638 { 639 int16x8_t line1 = vld1q_s16(src1 + j); 640 int16x8_t line2 = vld1q_s16(src2 + j); 641 642 int16x8_t l12 = vqaddq_s16(line1, line2); 643 644 vst1q_s16(dst0 + j, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12)); 645 vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j)))); 646 j += 8; 647 } 648 if (j != width) 649 { 650 j = width - 8; 651 int16x8_t line1 = vld1q_s16(src1 + j); 652 int16x8_t line2 = vld1q_s16(src2 + j); 653 654 int16x8_t l12 = vqaddq_s16(line1, line2); 655 656 vst1q_s16(dst0 + j, 
vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12)); 657 vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j)))); 658 } 659 } 660 operator ()CAROTENE_NS::internal::ColFilter3x3S16_121661 inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) 662 { 663 if (src0 == 0 || src2 == 0) 664 { 665 if (src2 == 0) 666 src2 = src0; 667 668 if (borderType == BORDER_MODE_CONSTANT) 669 { 670 int16x8_t v_border = vdupq_n_s16(borderValue); 671 ptrdiff_t j = 0; 672 for (; j <= width - 16; j += 16) 673 { 674 vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), 675 vqaddq_s16(v_border, vld1q_s16(src2 + j)))); 676 vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1), 677 vqaddq_s16(v_border, vld1q_s16(src2 + j + 8)))); 678 } 679 if (j <= width - 8) 680 { 681 vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), 682 vqaddq_s16(v_border, vld1q_s16(src2 + j)))); 683 j += 8; 684 } 685 if (j != width) 686 { 687 j = width - 8; 688 vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), 689 vqaddq_s16(v_border, vld1q_s16(src2 + j)))); 690 } 691 } 692 else if (borderType == BORDER_MODE_REFLECT101) 693 { 694 ptrdiff_t j = 0; 695 for (; j <= width - 16; j += 16) 696 { 697 vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j), 698 vld1q_s16(src2 + j)), 1)); 699 vst1q_s16(dst + j + 8, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j + 8), 700 vld1q_s16(src2 + j + 8)), 1)); 701 } 702 if (j <= width - 8) 703 { 704 vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j), 705 vld1q_s16(src2 + j)), 1)); 706 j += 8; 707 } 708 if (j != width) 709 { 710 j = width - 8; 711 vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j), 712 vld1q_s16(src2 + j)), 1)); 713 } 714 } 715 else //replicate\reflect 716 { 717 ptrdiff_t j = 0; 718 for (; j <= width - 16; j += 16) 719 { 720 int16x8_t line1 = vld1q_s16(src1 + j); 721 vst1q_s16(dst + j, 
vqaddq_s16(vqshlq_n_s16(line1, 1), 722 vqaddq_s16(line1, vld1q_s16(src2 + j)))); 723 724 line1 = vld1q_s16(src1 + j + 8); 725 vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(line1, 1), 726 vqaddq_s16(line1, vld1q_s16(src2 + j + 8)))); 727 } 728 if (j <= width - 8) 729 { 730 int16x8_t line1 = vld1q_s16(src1 + j); 731 vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(line1, 1), 732 vqaddq_s16(line1, vld1q_s16(src2 + j)))); 733 j += 8; 734 } 735 if (j != width) 736 { 737 j = width - 8; 738 int16x8_t line1 = vld1q_s16(src1 + j); 739 vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(line1, 1), 740 vqaddq_s16(line1, vld1q_s16(src2 + j)))); 741 } 742 } 743 } 744 else 745 { 746 ptrdiff_t j = 0; 747 for (; j <= width - 16; j += 16) 748 { 749 vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), 750 vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)))); 751 752 vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1), 753 vqaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8)))); 754 } 755 if (j <= width - 8) 756 { 757 vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), 758 vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)))); 759 j += 8; 760 } 761 if (j != width) 762 { 763 j = width - 8; 764 vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), 765 vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)))); 766 } 767 } 768 } 769 }; 770 771 struct ColFilter3x3U8_121 : public ColFilter3x3S16Base 772 { 773 typedef u8 dstType; 774 ColFilter3x3U8_121CAROTENE_NS::internal::ColFilter3x3U8_121775 inline ColFilter3x3U8_121(const BORDER_MODE _borderType, const srcType _borderValue, const s16*): 776 ColFilter3x3S16Base(_borderType, _borderValue) {} 777 operator ()CAROTENE_NS::internal::ColFilter3x3U8_121778 inline void operator()(const srcType* src0, const srcType* src1, const srcType* src2, const srcType* src3, dstType* dst0, dstType* dst1, ptrdiff_t width) 779 { 780 ptrdiff_t j = 0; 781 //int16x8_t line0 = vld1q_s16(src0 + j);//1 
782 //int16x8_t line1 = vld1q_s16(src1 + j);//11 783 //int16x8_t line2 = vld1q_s16(src2 + j);// 11 784 //int16x8_t line3 = vld1q_s16(src3 + j);// 1 785 for (; j <= width - 16; j += 16) 786 { 787 int16x8_t line1 = vld1q_s16(src1 + j); 788 int16x8_t line2 = vld1q_s16(src2 + j); 789 790 int16x8_t l12 = vaddq_s16(line1, line2); 791 792 vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4)); 793 vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4)); 794 795 line1 = vld1q_s16(src1 + j + 8); 796 line2 = vld1q_s16(src2 + j + 8); 797 798 l12 = vaddq_s16(line1, line2); 799 800 vst1_u8(dst0 + j + 8, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j + 8), line1), l12), 4)); 801 vst1_u8(dst1 + j + 8, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j + 8))), 4)); 802 } 803 if (j <= width - 8) 804 { 805 int16x8_t line1 = vld1q_s16(src1 + j); 806 int16x8_t line2 = vld1q_s16(src2 + j); 807 808 int16x8_t l12 = vaddq_s16(line1, line2); 809 810 vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4)); 811 vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4)); 812 j += 8; 813 } 814 if (j != width) 815 { 816 j = width - 8; 817 int16x8_t line1 = vld1q_s16(src1 + j); 818 int16x8_t line2 = vld1q_s16(src2 + j); 819 820 int16x8_t l12 = vaddq_s16(line1, line2); 821 822 vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4)); 823 vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4)); 824 } 825 } 826 operator ()CAROTENE_NS::internal::ColFilter3x3U8_121827 inline void operator()(const srcType* src0, const srcType* src1, const srcType* src2, dstType* dst, ptrdiff_t width) 828 { 829 if (src0 == 0 || src2 == 0) 830 { 831 if (src2 == 0) 832 src2 = src0; 833 834 if (borderType == BORDER_MODE_CONSTANT) 835 { 836 ptrdiff_t j = 0; 837 int16x8_t v_border = 
vdupq_n_s16(borderValue); 838 for (; j <= width - 16; j += 16) 839 { 840 //Store normalized result, essential for gaussianBlur 841 vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), 842 vaddq_s16(v_border, vld1q_s16(src2 + j))), 4)); 843 844 vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j + 8), 1), 845 vaddq_s16(v_border, vld1q_s16(src2 + j + 8))), 4)); 846 } 847 if (j <= width - 8) 848 { 849 vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), 850 vaddq_s16(v_border, vld1q_s16(src2 + j))), 4)); 851 j += 8; 852 } 853 if (j != width) 854 { 855 j = width - 8; 856 vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), 857 vaddq_s16(v_border, vld1q_s16(src2 + j))), 4)); 858 } 859 } 860 else if (borderType == BORDER_MODE_REFLECT101) 861 { 862 ptrdiff_t j = 0; 863 for (; j <= width - 16; j += 16) 864 { 865 vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j), 866 vld1q_s16(src2 + j)), 1), 4)); 867 vst1_u8(dst + j + 8, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j + 8), 868 vld1q_s16(src2 + j + 8)), 1), 4)); 869 } 870 if (j <= width - 8) 871 { 872 vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j), 873 vld1q_s16(src2 + j)), 1), 4)); 874 j += 8; 875 } 876 if (j != width) 877 { 878 j = width - 8; 879 vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j), 880 vld1q_s16(src2 + j)), 1), 4)); 881 } 882 } 883 else //replicate\reflect 884 { 885 ptrdiff_t j = 0; 886 for (; j <= width - 16; j += 16) 887 { 888 int16x8_t line1 = vld1q_s16(src1 + j); 889 vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), 890 vaddq_s16(line1, vld1q_s16(src2 + j))), 4)); 891 892 line1 = vld1q_s16(src1 + j + 8); 893 vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), 894 vaddq_s16(line1, vld1q_s16(src2 + j + 8))), 4)); 895 } 896 if (j <= width - 8) 897 { 898 int16x8_t line1 = vld1q_s16(src1 + j); 
899 vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), 900 vaddq_s16(line1, vld1q_s16(src2 + j))), 4)); 901 j += 8; 902 } 903 if (j != width) 904 { 905 j = width - 8; 906 int16x8_t line1 = vld1q_s16(src1 + j); 907 vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), 908 vaddq_s16(line1, vld1q_s16(src2 + j))), 4)); 909 } 910 } 911 } 912 else 913 { 914 ptrdiff_t j = 0; 915 for (; j <= width - 16; j += 16) 916 { 917 vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), 918 vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4)); 919 vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j + 8), 1), 920 vaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8))), 4)); 921 } 922 if (j <= width - 8) 923 { 924 vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), 925 vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4)); 926 j += 8; 927 } 928 if (j != width) 929 { 930 j = width - 8; 931 vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), 932 vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4)); 933 } 934 } 935 } 936 }; 937 938 struct ColFilter3x3S16_1m21 : public ColFilter3x3S16Base 939 { 940 typedef s16 dstType; 941 ColFilter3x3S16_1m21CAROTENE_NS::internal::ColFilter3x3S16_1m21942 inline ColFilter3x3S16_1m21(const BORDER_MODE _borderType, const srcType _borderValue, const s16*): 943 ColFilter3x3S16Base(_borderType, _borderValue) {} 944 operator ()CAROTENE_NS::internal::ColFilter3x3S16_1m21945 inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) 946 { 947 ptrdiff_t j = 0; 948 //int16x8_t line0 = vld1q_s16(src0 + j);// 1 949 //int16x8_t line1 = vld1q_s16(src1 + j);//-1 1 950 //int16x8_t line2 = vld1q_s16(src2 + j);// -1 -1 951 //int16x8_t line3 = vld1q_s16(src3 + j);// 1 952 for (; j <= width - 16; j += 16) 953 { 954 int16x8_t line1 = vld1q_s16(src1 + j); 955 
int16x8_t line2 = vld1q_s16(src2 + j); 956 957 int16x8_t l12 = vqsubq_s16(line1, line2); 958 959 vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12)); 960 vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12)); 961 962 line1 = vld1q_s16(src1 + j + 8); 963 line2 = vld1q_s16(src2 + j + 8); 964 965 l12 = vqsubq_s16(line1, line2); 966 967 vst1q_s16(dst0 + j + 8, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j + 8), line1), l12)); 968 vst1q_s16(dst1 + j + 8, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j + 8), line2), l12)); 969 } 970 if (j <= width - 8) 971 { 972 int16x8_t line1 = vld1q_s16(src1 + j); 973 int16x8_t line2 = vld1q_s16(src2 + j); 974 975 int16x8_t l12 = vqsubq_s16(line1, line2); 976 977 vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12)); 978 vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12)); 979 j += 8; 980 } 981 if (j != width) 982 { 983 j = width - 8; 984 int16x8_t line1 = vld1q_s16(src1 + j); 985 int16x8_t line2 = vld1q_s16(src2 + j); 986 987 int16x8_t l12 = vqsubq_s16(line1, line2); 988 989 vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12)); 990 vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12)); 991 } 992 } 993 operator ()CAROTENE_NS::internal::ColFilter3x3S16_1m21994 inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) 995 { 996 if (src0 == 0 || src2 == 0) 997 { 998 if (src2 == 0) 999 src2 = src0; 1000 1001 if (borderType == BORDER_MODE_CONSTANT) 1002 { 1003 ptrdiff_t j = 0; 1004 int16x8_t v_border = vdupq_n_s16(borderValue); 1005 for (; j <= width - 16; j += 16) 1006 { 1007 vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1))); 1008 vst1q_s16(dst + j + 8, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j + 8)), vshlq_n_s16(vld1q_s16(src1 + j + 8), 1))); 1009 } 1010 if (j <= width - 8) 1011 { 1012 
vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1))); 1013 j += 8; 1014 } 1015 if (j != width) 1016 { 1017 j = width - 8; 1018 vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1))); 1019 } 1020 } 1021 else if (borderType == BORDER_MODE_REFLECT101) 1022 { 1023 ptrdiff_t j = 0; 1024 for (; j <= width - 16; j += 16) 1025 { 1026 vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1)); 1027 vst1q_s16(dst + j + 8, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src1 + j + 8)), 1)); 1028 } 1029 if (j <= width - 8) 1030 { 1031 vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1)); 1032 j += 8; 1033 } 1034 if (j != width) 1035 { 1036 j = width - 8; 1037 vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1)); 1038 } 1039 } 1040 else //replicate\reflect 1041 { 1042 ptrdiff_t j = 0; 1043 for (; j <= width - 16; j += 16) 1044 { 1045 vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j))); 1046 vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src1 + j + 8))); 1047 } 1048 if (j <= width - 8) 1049 { 1050 vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j))); 1051 j += 8; 1052 } 1053 if (j != width) 1054 { 1055 j = width - 8; 1056 vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j))); 1057 } 1058 } 1059 } 1060 else 1061 { 1062 ptrdiff_t j = 0; 1063 for (; j <= width - 16; j += 16) 1064 { 1065 vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)), 1066 vqshlq_n_s16(vld1q_s16(src1 + j), 1))); 1067 vst1q_s16(dst + j + 8, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8)), 1068 vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1))); 1069 } 1070 if (j <= width - 8) 1071 { 1072 vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), 
vld1q_s16(src2 + j)), 1073 vqshlq_n_s16(vld1q_s16(src1 + j), 1))); 1074 j += 8; 1075 } 1076 if (j != width) 1077 { 1078 j = width - 8; 1079 vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)), 1080 vqshlq_n_s16(vld1q_s16(src1 + j), 1))); 1081 } 1082 } 1083 } 1084 }; 1085 1086 template<class RowFilter, class ColFilter> struct sepFilter3x3 1087 { 1088 typedef typename RowFilter::srcType srcType; 1089 typedef typename RowFilter::dstType tmpType; 1090 typedef typename ColFilter::dstType dstType; 1091 processCAROTENE_NS::internal::sepFilter3x31092 static void process(const Size2D &ssize, 1093 const srcType * srcBase, ptrdiff_t srcStride, 1094 dstType * dstBase, ptrdiff_t dstStride, 1095 const s16 *xw, const s16 *yw, 1096 BORDER_MODE borderType, srcType borderValue, Margin borderMargin) 1097 { 1098 const ptrdiff_t offsetk = 1; 1099 ptrdiff_t borderxl, borderxr, borderyt, borderyb; 1100 borderxl = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.left); 1101 borderyt = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.top); 1102 borderxr = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.right); 1103 borderyb = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.bottom); 1104 1105 std::vector<tmpType> _buf(ssize.width << 2); 1106 tmpType * buf = &_buf[0]; 1107 1108 RowFilter filterX(borderType, borderValue, borderxl, borderxr, xw); 1109 ColFilter filterY(borderType, filterX.borderFilter, yw); 1110 const ptrdiff_t lookTop = offsetk - borderyt; 1111 const ptrdiff_t lookBottom = offsetk - borderyb; 1112 1113 const srcType* src = srcBase - lookTop * srcStride / sizeof(srcType); 1114 dstType* dst = dstBase; 1115 1116 ptrdiff_t ridx = -lookTop; 1117 for (; ridx <= (ptrdiff_t)ssize.height + lookBottom - 2; ridx += 2) 1118 { 1119 for (ptrdiff_t bidx = 0; bidx < 2; ++bidx, src += srcStride / sizeof(srcType)) 1120 filterX(src, buf + ssize.width * ((4 + ridx + bidx) % 4), ssize.width); 1121 1122 if (ridx <= 0) 1123 { 1124 if 
(ridx == 0) //first row 1125 { 1126 filterY(0, buf + ssize.width * ((ridx + 4) % 4), buf + ssize.width * ((ridx + 1) % 4), dst, ssize.width); 1127 dst += dstStride / sizeof(dstType); 1128 } 1129 continue; 1130 } 1131 1132 filterY(buf + ssize.width * ((ridx + 2) % 4), 1133 buf + ssize.width * ((ridx + 3) % 4), 1134 buf + ssize.width * ((ridx + 4) % 4), 1135 buf + ssize.width * ((ridx + 1) % 4), 1136 dst, dst + dstStride / sizeof(dstType), ssize.width); 1137 1138 dst += dstStride * 2 / sizeof(dstType); 1139 } 1140 1141 if (ridx < (ptrdiff_t)ssize.height + lookBottom) 1142 { 1143 filterX(src, buf + ssize.width * ((4 + ridx) % 4), ssize.width); 1144 filterY(buf + ssize.width * ((2 + ridx) % 4), 1145 buf + ssize.width * ((3 + ridx) % 4), 1146 buf + ssize.width * ((4 + ridx) % 4), dst, ssize.width); 1147 dst += dstStride / sizeof(dstType); 1148 ridx++; 1149 } 1150 if (lookBottom == 0) 1151 filterY(buf + ssize.width * ((ridx + 2) % 4), buf + ssize.width * ((ridx + 3) % 4), 0, dst, ssize.width); 1152 } 1153 }; 1154 1155 } //namespace internal 1156 1157 } //namespace CAROTENE_NS 1158 1159 #endif // CAROTENE_NEON 1160 1161 #endif // CAROTENE_SRC_REMAP_HPP 1162