/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#ifndef AOM_DSP_X86_CONVOLVE_H_
#define AOM_DSP_X86_CONVOLVE_H_

// Function-generating macros for the convolve wrappers. Each macro below
// expands to one complete aom_convolve8_* / aom_highbd_convolve8_* function
// definition that dispatches to width-specific aom_filter_block1d* kernels
// and, where those do not apply, to the corresponding `_c` function.
//
// The macro arguments are pasted or substituted textually, so expressions
// passed as `step_q4`, `filter` and `src_start` are written in terms of the
// generated function's own parameter names (src, src_stride, filter_x,
// x_step_q4, filter_y, y_step_q4, ...).

#include <assert.h>

#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_convolve.h"

// Function type with the same argument order as the aom_filter_block1d*
// calls made below: source pointer, source pitch, destination pointer,
// destination pitch, number of output rows, filter taps.
// NOTE(review): presumably used to declare those kernels; the declarations
// are not in this file.
typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                uint8_t *output_ptr, ptrdiff_t out_pitch,
                                uint32_t output_height, const int16_t *filter);

// Defines aom_convolve8_<name>_<opt>(), a one-dimensional convolve wrapper.
//
// Macro arguments:
//   name      - pasted into the generated function name and into the name of
//               the C function aom_convolve8_<name>_c used for leftovers.
//   step_q4   - expression asserted to equal 16.
//   filter    - expression yielding the tap array; filter[0..3] are read here
//               and the array is handed to the kernels.
//   dir       - pasted into the kernel name in front of the `8` / `2` digit.
//   src_start - expression passed as the source pointer to the `8` kernels.
//               It is re-evaluated on every call, after `src` has advanced.
//               The `2` kernels are always passed plain `src`.
//   avg       - pasted directly in front of `opt` in the kernel name; it may
//               be empty, and any separator must be part of the argument.
//   opt       - suffix pasted onto the generated function and kernel names.
//
// Generated behaviour:
//   * Asserts that filter[3] is within [-128, 127] and that step_q4 == 16.
//     These checks vanish when assert() is compiled out.
//   * If any of filter[0], filter[1], filter[2] is non-zero the `8` kernels
//     are used, otherwise the `2` kernels (the bilinear case, going by the
//     NO_BILINEAR variant further down which omits them).
//   * The block is processed in column strips of 16, then 8, then 4, moving
//     src and dst right by the strip width each time; every call covers all
//     h rows.
//   * Whatever width remains (w not a multiple of 4) is handed to
//     aom_convolve8_<name>_c with the already advanced src/dst.
//
// The (void) casts reference all four filter/step parameters explicitly;
// presumably this keeps unused-parameter warnings away.
#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)          \
  void aom_convolve8_##name##_##opt(                                          \
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \
      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
      const int16_t *filter_y, int y_step_q4, int w, int h) {                 \
    (void)filter_x;                                                           \
    (void)x_step_q4;                                                          \
    (void)filter_y;                                                           \
    (void)y_step_q4;                                                          \
    assert((-128 <= filter[3]) && (filter[3] <= 127));                        \
    assert(step_q4 == 16);                                                    \
    /* Non-zero taps in positions 0..2 select the "8" kernels. */             \
    if (filter[0] | filter[1] | filter[2]) {                                  \
      while (w >= 16) {                                                       \
        aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst,  \
                                                 dst_stride, h, filter);      \
        src += 16;                                                            \
        dst += 16;                                                            \
        w -= 16;                                                              \
      }                                                                       \
      while (w >= 8) {                                                        \
        aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,   \
                                                dst_stride, h, filter);       \
        src += 8;                                                             \
        dst += 8;                                                             \
        w -= 8;                                                               \
      }                                                                       \
      while (w >= 4) {                                                        \
        aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,   \
                                                dst_stride, h, filter);       \
        src += 4;                                                             \
        dst += 4;                                                             \
        w -= 4;                                                               \
      }                                                                       \
    } else {                                                                  \
      while (w >= 16) {                                                       \
        aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst,        \
                                                 dst_stride, h, filter);      \
        src += 16;                                                            \
        dst += 16;                                                            \
        w -= 16;                                                              \
      }                                                                       \
      while (w >= 8) {                                                        \
        aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst,         \
                                                dst_stride, h, filter);       \
        src += 8;                                                             \
        dst += 8;                                                             \
        w -= 8;                                                               \
      }                                                                       \
      while (w >= 4) {                                                        \
        aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst,         \
                                                dst_stride, h, filter);       \
        src += 4;                                                             \
        dst += 4;                                                             \
        w -= 4;                                                               \
      }                                                                       \
    }                                                                         \
    /* Widths left over after the 16/8/4 passes use the C version. */         \
    if (w) {                                                                  \
      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x,    \
                               x_step_q4, filter_y, y_step_q4, w, h);         \
    }                                                                         \
  }

// Defines aom_convolve8_<avg><opt>(), a two-pass (horizontal, then vertical)
// convolve built on aom_convolve8_horiz_<opt> and
// aom_convolve8_<avg>vert_<opt>.
//
// Macro arguments:
//   avg - pasted directly in front of `opt` / `vert_`; may be empty, and any
//         separator must be part of the argument.
//   opt - suffix pasted onto the generated and called function names.
//
// Generated behaviour:
//   * Asserts filter_x[3] and filter_y[3] are within [-128, 127], that w and
//     h do not exceed MAX_SB_SIZE, and that both steps equal 16.
//   * If any of taps 0..2 of either filter is non-zero, the horizontal pass
//     starts 3 rows above `src` and writes h + 7 rows into the intermediate
//     buffer fdata2 (row stride MAX_SB_SIZE); the vertical pass then reads
//     from row 3 of that buffer and writes h rows to dst. This is consistent
//     with a vertical filter that needs 3 rows above and 4 rows below each
//     output row.
//   * Otherwise the horizontal pass starts at `src` and writes h + 1 rows,
//     and the vertical pass reads from row 0 of the buffer.
//   * fdata2 is a local array declared through DECLARE_ALIGNED(16, ...),
//     sized for MAX_SB_SIZE columns and the number of rows written above.
#define FUN_CONV_2D(avg, opt)                                                 \
  void aom_convolve8_##avg##opt(                                              \
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \
      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
      const int16_t *filter_y, int y_step_q4, int w, int h) {                 \
    assert((-128 <= filter_x[3]) && (filter_x[3] <= 127));                    \
    assert((-128 <= filter_y[3]) && (filter_y[3] <= 127));                    \
    assert(w <= MAX_SB_SIZE);                                                 \
    assert(h <= MAX_SB_SIZE);                                                 \
    assert(x_step_q4 == 16);                                                  \
    assert(y_step_q4 == 16);                                                  \
    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_y[0] ||           \
        filter_y[1] || filter_y[2]) {                                         \
      DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]);  \
      aom_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2,     \
                                MAX_SB_SIZE, filter_x, x_step_q4, filter_y,   \
                                y_step_q4, w, h + 7);                         \
      aom_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE,  \
                                      dst, dst_stride, filter_x, x_step_q4,   \
                                      filter_y, y_step_q4, w, h);             \
    } else {                                                                  \
      DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]);  \
      aom_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE,         \
                                filter_x, x_step_q4, filter_y, y_step_q4, w,  \
                                h + 1);                                       \
      aom_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride,   \
                                      filter_x, x_step_q4, filter_y,          \
                                      y_step_q4, w, h);                       \
    }                                                                         \
  }

#if CONFIG_LOOP_RESTORATION
// convolve_add_src is only used by the Wiener filter, which will never
// end up calling the bilinear functions (it uses a symmetric filter, so
// the possible numbers of taps are 1,3,5,7)
//
// Same expansion as FUN_CONV_1D (same macro arguments, same assertions, same
// 16/8/4 column strips, same C call for leftover width) except that the tap
// test is gone: the `8` kernels are called unconditionally and the `2`
// kernels are never referenced.
#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg,   \
                                opt)                                          \
  void aom_convolve8_##name##_##opt(                                          \
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \
      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
      const int16_t *filter_y, int y_step_q4, int w, int h) {                 \
    (void)filter_x;                                                           \
    (void)x_step_q4;                                                          \
    (void)filter_y;                                                           \
    (void)y_step_q4;                                                          \
    assert((-128 <= filter[3]) && (filter[3] <= 127));                        \
    assert(step_q4 == 16);                                                    \
    while (w >= 16) {                                                         \
      aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst,    \
                                               dst_stride, h, filter);        \
      src += 16;                                                              \
      dst += 16;                                                              \
      w -= 16;                                                                \
    }                                                                         \
    while (w >= 8) {                                                          \
      aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,     \
                                              dst_stride, h, filter);         \
      src += 8;                                                               \
      dst += 8;                                                               \
      w -= 8;                                                                 \
    }                                                                         \
    while (w >= 4) {                                                          \
      aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,     \
                                              dst_stride, h, filter);         \
      src += 4;                                                               \
      dst += 4;                                                               \
      w -= 4;                                                                 \
    }                                                                         \
    if (w) {                                                                  \
      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x,    \
                               x_step_q4, filter_y, y_step_q4, w, h);         \
    }                                                                         \
  }

// Defines aom_convolve8_<type><opt>(), a two-pass convolve that always takes
// the h + 7 row path (no h + 1 alternative).
//
// Macro arguments:
//   type  - pasted in front of `opt` in the generated name and in front of
//           `vert_` in the vertical pass name.
//   htype - pasted in front of `horiz_` in the horizontal pass name.
//   opt   - suffix pasted onto the generated and called function names.
//
// The horizontal pass starts 3 rows above `src` and writes h + 7 rows into
// fdata2 (row stride MAX_SB_SIZE); the vertical pass reads from row 3 of
// fdata2 and writes h rows to dst. The assertions match FUN_CONV_2D.
#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt)                             \
  void aom_convolve8_##type##opt(                                             \
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \
      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
      const int16_t *filter_y, int y_step_q4, int w, int h) {                 \
    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]);    \
    assert((-128 <= filter_x[3]) && (filter_x[3] <= 127));                    \
    assert((-128 <= filter_y[3]) && (filter_y[3] <= 127));                    \
    assert(w <= MAX_SB_SIZE);                                                 \
    assert(h <= MAX_SB_SIZE);                                                 \
    assert(x_step_q4 == 16);                                                  \
    assert(y_step_q4 == 16);                                                  \
    aom_convolve8_##htype##horiz_##opt(                                       \
        src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x,      \
        x_step_q4, filter_y, y_step_q4, w, h + 7);                            \
    aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE,   \
                                     dst, dst_stride, filter_x, x_step_q4,    \
                                     filter_y, y_step_q4, w, h);              \
  }
#endif  // CONFIG_LOOP_RESTORATION

#if CONFIG_HIGHBITDEPTH
// Function type with the same argument order as the
// aom_highbd_filter_block1d* calls made below: 16-bit source pointer, source
// pitch, 16-bit destination pointer, destination pitch, number of output
// rows, filter taps, and the `bd` value.
// NOTE(review): presumably used to declare those kernels; the declarations
// are not in this file.
typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                       const ptrdiff_t src_pitch,
                                       uint16_t *output_ptr,
                                       ptrdiff_t out_pitch,
                                       unsigned int output_height,
                                       const int16_t *filter, int bd);

// Defines aom_highbd_convolve8_<name>_<opt>(), the high-bitdepth counterpart
// of FUN_CONV_1D. The macro arguments have the same roles as there.
//
// Differences visible in the expansion:
//   * src8/dst8 are converted with CONVERT_TO_SHORTPTR and the kernels work
//     on the resulting uint16_t pointers; `bd` is forwarded to every kernel.
//   * There are no assertions. The kernels are only used when
//     step_q4 == 16 and filter[3] != 128; otherwise w is left untouched and
//     the whole block goes to aom_highbd_convolve8_<name>_c.
//   * Inside that guard, the `8` / `2` kernel choice and the 16/8/4 column
//     strips are the same as in FUN_CONV_1D.
//   * Leftover width is passed to the C function with src/dst converted back
//     through CONVERT_TO_BYTEPTR.
#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)     \
  void aom_highbd_convolve8_##name##_##opt(                                   \
      const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,               \
      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {         \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                \
    if (step_q4 == 16 && filter[3] != 128) {                                  \
      if (filter[0] | filter[1] | filter[2]) {                                \
        while (w >= 16) {                                                     \
          aom_highbd_filter_block1d16_##dir##8_##avg##opt(                    \
              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
          src += 16;                                                          \
          dst += 16;                                                          \
          w -= 16;                                                            \
        }                                                                     \
        while (w >= 8) {                                                      \
          aom_highbd_filter_block1d8_##dir##8_##avg##opt(                     \
              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
          src += 8;                                                           \
          dst += 8;                                                           \
          w -= 8;                                                             \
        }                                                                     \
        while (w >= 4) {                                                      \
          aom_highbd_filter_block1d4_##dir##8_##avg##opt(                     \
              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
          src += 4;                                                           \
          dst += 4;                                                           \
          w -= 4;                                                             \
        }                                                                     \
      } else {                                                                \
        while (w >= 16) {                                                     \
          aom_highbd_filter_block1d16_##dir##2_##avg##opt(                    \
              src, src_stride, dst, dst_stride, h, filter, bd);               \
          src += 16;                                                          \
          dst += 16;                                                          \
          w -= 16;                                                            \
        }                                                                     \
        while (w >= 8) {                                                      \
          aom_highbd_filter_block1d8_##dir##2_##avg##opt(                     \
              src, src_stride, dst, dst_stride, h, filter, bd);               \
          src += 8;                                                           \
          dst += 8;                                                           \
          w -= 8;                                                             \
        }                                                                     \
        while (w >= 4) {                                                      \
          aom_highbd_filter_block1d4_##dir##2_##avg##opt(                     \
              src, src_stride, dst, dst_stride, h, filter, bd);               \
          src += 4;                                                           \
          dst += 4;                                                           \
          w -= 4;                                                             \
        }                                                                     \
      }                                                                       \
    }                                                                         \
    /* Either the leftover strip or, if the guard failed, the full width. */  \
    if (w) {                                                                  \
      aom_highbd_convolve8_##name##_c(                                        \
          CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst),       \
          dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);    \
    }                                                                         \
  }

// Defines aom_highbd_convolve8_<avg><opt>(), the high-bitdepth counterpart of
// FUN_CONV_2D. `avg` and `opt` have the same roles as there; `avg` is also
// pasted directly in front of `c` to name the C function used when the steps
// are not both 16.
//
// Generated behaviour:
//   * Asserts that w and h do not exceed MAX_SB_SIZE.
//   * If x_step_q4 and y_step_q4 are both 16:
//       - when any of taps 0..2 is non-zero, or tap 3 equals 128, for either
//         filter, the h + 7 row path is taken: horizontal pass from 3 rows
//         above `src` into fdata2, vertical pass from row 3 of fdata2;
//       - otherwise the h + 1 row path is taken, starting at `src` and at
//         row 0 of fdata2.
//     fdata2 holds uint16_t samples and is passed to the two passes through
//     CONVERT_TO_BYTEPTR; the row-3 offset is added to the converted
//     pointer.
//   * Otherwise the call is forwarded unchanged to
//     aom_highbd_convolve8_<avg>c.
#define HIGH_FUN_CONV_2D(avg, opt)                                            \
  void aom_highbd_convolve8_##avg##opt(                                       \
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \
      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {         \
    assert(w <= MAX_SB_SIZE);                                                 \
    assert(h <= MAX_SB_SIZE);                                                 \
    if (x_step_q4 == 16 && y_step_q4 == 16) {                                 \
      if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 ||  \
          filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) {  \
        DECLARE_ALIGNED(16, uint16_t,                                         \
                        fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]);             \
        aom_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride,    \
                                         CONVERT_TO_BYTEPTR(fdata2),          \
                                         MAX_SB_SIZE, filter_x, x_step_q4,    \
                                         filter_y, y_step_q4, w, h + 7, bd);  \
        aom_highbd_convolve8_##avg##vert_##opt(                               \
            CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, MAX_SB_SIZE, dst,   \
            dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);  \
      } else {                                                                \
        DECLARE_ALIGNED(16, uint16_t,                                         \
                        fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]);             \
        aom_highbd_convolve8_horiz_##opt(                                     \
            src, src_stride, CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE,         \
            filter_x, x_step_q4, filter_y, y_step_q4, w, h + 1, bd);          \
        aom_highbd_convolve8_##avg##vert_##opt(                               \
            CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, dst, dst_stride,         \
            filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);              \
      }                                                                       \
    } else {                                                                  \
      aom_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride,         \
                                    filter_x, x_step_q4, filter_y, y_step_q4, \
                                    w, h, bd);                                \
    }                                                                         \
  }
#endif  // CONFIG_HIGHBITDEPTH

#endif  // AOM_DSP_X86_CONVOLVE_H_