;******************************************************************************
;* SIMD optimized SAO functions for HEVC 8bit decoding
;*
;* Copyright (c) 2013 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; pshufb control used to reorder the packed SAO offsets so that they can be
; indexed directly by the per-pixel edge index (0..4); -1 lanes produce zero.
pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
; Neighbour deltas for the 4 edge-offset directions (eo = 0..3):
; four bytes per direction: {a_dx, a_dy, b_dx, b_dy}.
pb_eo:                     db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
cextern pb_1
cextern pb_2

SECTION .text

;******************************************************************************
;SAO Band Filter
;******************************************************************************

; Broadcast the four band indices (sao_left_class + 0..3, each mod 32) into
; m0-m3 as words, and the four matching 16-bit offsets (offsetq[1..4]) into
; m4-m7.  On x86-32 there are not enough XMM registers, so the constants are
; spilled to the stack and the high register names used by the compute macro
; are aliased onto low ones.  Finally remaps the argument names and loads
; height from the 8th stack argument.
%macro HEVC_SAO_BAND_FILTER_INIT 0
    and             leftq, 31
    movd              xm0, leftd
    add             leftq, 1
    and             leftq, 31
    movd              xm1, leftd
    add             leftq, 1
    and             leftq, 31
    movd              xm2, leftd
    add             leftq, 1
    and             leftq, 31
    movd              xm3, leftd

    SPLATW             m0, xm0
    SPLATW             m1, xm1
    SPLATW             m2, xm2
    SPLATW             m3, xm3
%if mmsize > 16
    ; YMM: broadcast each 16-bit offset straight from memory
    SPLATW             m4, [offsetq + 2]
    SPLATW             m5, [offsetq + 4]
    SPLATW             m6, [offsetq + 6]
    SPLATW             m7, [offsetq + 8]
%else
    ; XMM: load all four offsets at once, then splat each word lane
    movq               m7, [offsetq + 2]
    SPLATW             m4, m7, 0
    SPLATW             m5, m7, 1
    SPLATW             m6, m7, 2
    SPLATW             m7, m7, 3
%endif

%if ARCH_X86_64
    pxor              m14, m14                    ; zero register for unpacking

%else ; ARCH_X86_32
    ; spill band indices and offsets; m7 (4th offset) stays in a register
    mova  [rsp+mmsize*0], m0
    mova  [rsp+mmsize*1], m1
    mova  [rsp+mmsize*2], m2
    mova  [rsp+mmsize*3], m3
    mova  [rsp+mmsize*4], m4
    mova  [rsp+mmsize*5], m5
    mova  [rsp+mmsize*6], m6
    pxor               m0, m0                     ; zero register (aliased as m14)
    %assign MMSIZE mmsize
    %define m14 m0
    %define m13 m1
    %define m9  m2
    %define m8  m3
%endif ; ARCH
DEFINE_ARGS dst, src, dststride, srcstride, offset, height
    mov           heightd, r7m
%endmacro

; %1 = scratch register, %2 = pixels widened to words (modified in place).
; Band index = pixel >> 3; compare it against the four band indices, mask
; each match with the corresponding offset, merge, and add to the pixels.
; Pixels falling outside the four selected bands receive a zero offset.
%macro HEVC_SAO_BAND_FILTER_COMPUTE 2
    psraw              %1, %2, 3
%if ARCH_X86_64
    pcmpeqw           m10, %1, m0
    pcmpeqw           m11, %1, m1
    pcmpeqw           m12, %1, m2
    pcmpeqw            %1, m3
    pand              m10, m4
    pand              m11, m5
    pand              m12, m6
    pand               %1, m7
    por               m10, m11
    por               m12, %1
    por               m10, m12
    paddw              %2, m10
%else ; ARCH_X86_32
    ; same computation with the constants read back from the stack spills
    pcmpeqw            m4, %1, [rsp+MMSIZE*0]
    pcmpeqw            m5, %1, [rsp+MMSIZE*1]
    pcmpeqw            m6, %1, [rsp+MMSIZE*2]
    pcmpeqw            %1, [rsp+MMSIZE*3]
    pand               m4, [rsp+MMSIZE*4]
    pand               m5, [rsp+MMSIZE*5]
    pand               m6, [rsp+MMSIZE*6]
    pand               %1, m7
    por                m4, m5
    por                m6, %1
    por                m4, m6
    paddw              %2, m4
%endif ; ARCH
%endmacro

;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
;                                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
; %1 = block width, %2 = number of full-vector iterations per row.
; Width 8 uses a half-register path; width 48 adds an XMM pass for the tail.
%macro HEVC_SAO_BAND_FILTER 2
cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
    HEVC_SAO_BAND_FILTER_INIT

align 16
.loop:
%if %1 == 8
    movq               m8, [srcq]
    punpcklbw          m8, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
    packuswb           m8, m14
    movq           [dstq], m8
%endif ; %1 == 8

%assign i 0
%rep %2
    ; widen to words (low/high halves), apply offsets, repack with clamping
    mova              m13, [srcq + i]
    punpcklbw          m8, m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
    punpckhbw         m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
    packuswb           m8, m13
    mova       [dstq + i], m8
%assign i i+mmsize
%endrep

%if %1 == 48
INIT_XMM cpuname
    ; 16-byte tail of the 48-wide block, processed with XMM registers
    mova              m13, [srcq + i]
    punpcklbw          m8, m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
    punpckhbw         m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
    packuswb           m8, m13
    mova       [dstq + i], m8
%if cpuflag(avx2)
INIT_YMM cpuname
%endif
%endif ; %1 == 48

    add              dstq, dststrideq             ; dst += dststride
    add              srcq, srcstrideq             ; src += srcstride
    dec           heightd                         ; cmp height
    jnz .loop                                     ; height loop
    REP_RET
%endmacro


%macro HEVC_SAO_BAND_FILTER_FUNCS 0
HEVC_SAO_BAND_FILTER  8, 0
HEVC_SAO_BAND_FILTER 16, 1
HEVC_SAO_BAND_FILTER 32, 2
HEVC_SAO_BAND_FILTER 48, 2
HEVC_SAO_BAND_FILTER 64, 4
%endmacro

INIT_XMM sse2
HEVC_SAO_BAND_FILTER_FUNCS
INIT_XMM avx
HEVC_SAO_BAND_FILTER_FUNCS

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
HEVC_SAO_BAND_FILTER  8, 0
HEVC_SAO_BAND_FILTER 16, 1
INIT_YMM avx2
HEVC_SAO_BAND_FILTER 32, 1
HEVC_SAO_BAND_FILTER 48, 1
HEVC_SAO_BAND_FILTER 64, 2
%endif

;******************************************************************************
;SAO Edge Filter
;******************************************************************************

; Row stride of the intermediate source buffer the decoder passes in.
%define MAX_PB_SIZE    64
%define PADDING_SIZE   64 ; AV_INPUT_BUFFER_PADDING_SIZE
%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE

; Turn the edge-offset direction (eo = 0..3) into two byte displacements:
; a_strideq / b_strideq = dy * EDGE_SRCSTRIDE + dx, with {dx, dy} pairs
; taken from the pb_eo table.
%macro HEVC_SAO_EDGE_FILTER_INIT 0
%if WIN64
    movsxd           eoq, dword eom
%elif ARCH_X86_64
    movsxd           eoq, eod
%else
    mov              eoq, r4m
%endif
    lea            tmp2q, [pb_eo]
    movsx      a_strideq, byte [tmp2q+eoq*4+1]    ; a_dy
    movsx      b_strideq, byte [tmp2q+eoq*4+3]    ; b_dy
    imul       a_strideq, EDGE_SRCSTRIDE
    imul       b_strideq, EDGE_SRCSTRIDE
    movsx           tmpq, byte [tmp2q+eoq*4]      ; a_dx
    add        a_strideq, tmpq
    movsx           tmpq, byte [tmp2q+eoq*4+2]    ; b_dx
    add        b_strideq, tmpq
%endmacro

; %1 = block width.  Inputs: m1 = current pixels, m2 = neighbour a,
; m3 = neighbour b, m6 = pb_2, m7 = pb_1, m0 = shuffled offset table.
; Per byte, computes the edge index 2 + sign(cur-a) + sign(cur-b) using the
; unsigned-min/compare-equal trick (pminub/pcmpeqb give -1/0/+1 without a
; signed compare), looks the offset up with pshufb, then adds it to the
; pixel via pmaddubsw on (1,pix)*(offset,1) pairs and clamps with packuswb.
; Result lands in m3.
%macro HEVC_SAO_EDGE_FILTER_COMPUTE 1
    pminub            m4, m1, m2
    pminub            m5, m1, m3
    pcmpeqb           m2, m4                      ; a <= cur (unsigned)
    pcmpeqb           m3, m5                      ; b <= cur (unsigned)
    pcmpeqb           m4, m1                      ; cur <= a
    pcmpeqb           m5, m1                      ; cur <= b
    psubb             m4, m2                      ; sign(cur - a)
    psubb             m5, m3                      ; sign(cur - b)
    paddb             m4, m6                      ; + 2 -> edge index 0..4
    paddb             m4, m5

    pshufb            m2, m0, m4                  ; per-pixel signed offset
%if %1 > 8
    punpckhbw         m5, m7, m1
    punpckhbw         m4, m2, m7
    punpcklbw         m3, m7, m1
    punpcklbw         m2, m7
    pmaddubsw         m5, m4                      ; 1*offset + pix*1, word sat.
    pmaddubsw         m3, m2
    packuswb          m3, m5                      ; clamp to [0, 255]
%else
    punpcklbw         m3, m7, m1
    punpcklbw         m2, m7
    pmaddubsw         m3, m2
    packuswb          m3, m3
%endif
%endmacro

;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
;                                             int eo, int width, int height);
; %1 = block width, %2 = full-vector iterations per row, %3 (optional) =
; store alignment suffix ('a'/'u') for the vector stores.
%macro HEVC_SAO_EDGE_FILTER 2-3
%if ARCH_X86_64
cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
%define tmp2q heightq
    HEVC_SAO_EDGE_FILTER_INIT
    mov          heightd, r6m

%else ; ARCH_X86_32
    ; only dstq is pinned; the other arguments are reloaded from the stack
    ; after INIT, reusing registers whose first role is then over
cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
%define eoq     srcq
%define tmpq    heightq
%define tmp2q   dststrideq
%define offsetq heightq
    HEVC_SAO_EDGE_FILTER_INIT
    mov             srcq, srcm
    mov          offsetq, r3m
    mov       dststrideq, dststridem
%endif ; ARCH

    ; build the pshufb-indexed offset table in m0 (bytes, packed from words)
%if mmsize > 16
    vbroadcasti128    m0, [offsetq]
%else
    movu              m0, [offsetq]
%endif
    mova              m1, [pb_edge_shuffle]
    packsswb          m0, m0
    mova              m7, [pb_1]
    pshufb            m0, m1
    mova              m6, [pb_2]
%if ARCH_X86_32
    mov          heightd, r6m                     ; heightq was used as scratch above
%endif

align 16
.loop:

%if %1 == 8
    movq              m1, [srcq]
    movq              m2, [srcq + a_strideq]
    movq              m3, [srcq + b_strideq]
    HEVC_SAO_EDGE_FILTER_COMPUTE %1
    movq          [dstq], m3
%endif

%assign i 0
%rep %2
    ; neighbour loads are unaligned: a/b strides include a +-1 pixel shift
    mova              m1, [srcq + i]
    movu              m2, [srcq + a_strideq + i]
    movu              m3, [srcq + b_strideq + i]
    HEVC_SAO_EDGE_FILTER_COMPUTE %1
    mov%3     [dstq + i], m3
%assign i i+mmsize
%endrep

%if %1 == 48
INIT_XMM cpuname
    ; 16-byte tail of the 48-wide block, processed with XMM registers
    mova              m1, [srcq + i]
    movu              m2, [srcq + a_strideq + i]
    movu              m3, [srcq + b_strideq + i]
    HEVC_SAO_EDGE_FILTER_COMPUTE %1
    mova      [dstq + i], m3
%if cpuflag(avx2)
INIT_YMM cpuname
%endif
%endif

    add             dstq, dststrideq              ; dst += dststride
    add             srcq, EDGE_SRCSTRIDE          ; src += fixed tmp-buffer stride
    dec          heightd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
HEVC_SAO_EDGE_FILTER  8, 0
HEVC_SAO_EDGE_FILTER 16, 1, a
HEVC_SAO_EDGE_FILTER 32, 2, a
HEVC_SAO_EDGE_FILTER 48, 2, a
HEVC_SAO_EDGE_FILTER 64, 4, a

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
HEVC_SAO_EDGE_FILTER 32, 1, a
HEVC_SAO_EDGE_FILTER 48, 1, u
HEVC_SAO_EDGE_FILTER 64, 2, a
%endif