;******************************************************************************
;* be_blur.asm: SSE2 \be blur
;******************************************************************************
;* Copyright (C) 2013 Rodger Combs <rcombs@rcombs.me>
;*
;* This file is part of libass.
;*
;* Permission to use, copy, modify, and distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA 32

; 256-bit mask: clears the lowest 16-bit word, keeps the other fifteen words.
; Used by the AVX2 path to drop the word that wraps around in vpalignr.
low_word_zero: dd 0xFFFF0000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
               dd 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF

SECTION .text

;------------------------------------------------------------------------------
; void be_blur( uint8_t *buf, unsigned width,
;               unsigned height, unsigned stride,
;               uint16_t *tmp);
;
; In-place 3x3 [1 2 1] x [1 2 1] / 16 blur of an 8-bit bitmap.
;
; Arguments (x86inc numbering):
;   r0 = buf     first byte of the bitmap, modified in place
;   r1 = width   pixels per row
;   r2 = height  number of rows
;   r3 = stride  bytes between consecutive rows
;   r4 = tmp     uint16_t scratch; col_pix_buf = tmp, col_sum_buf = tmp + stride
;
; Register roles inside the function:
;   r5  = y                      r6  = x
;   r7  = src row pointer        r13 = dst row pointer (row y-1)
;   r8  = old_pix  (src[x-1])    r9  = old_sum (src[x-2] + src[x-1])
;   r10, r11 = scratch           r12 = col_sum_buf
;   r14 = (width - 2) rounded down to the vector step
;
; Rows 0 and 1 only prime the column buffers; every later row y writes the
; blurred result for row y-1, columns 1 .. width-2.
;
; NOTE(review): width/height/stride are used as full 64-bit registers and
; compared with signed jumps - presumably the C caller passes pointer-sized,
; non-negative values; confirm against the C prototype.
; NOTE(review): every loop below is a do-while, so the code assumes
; width >= 3 and height >= 3, and the SSE2 vector loop always processes one
; full 8-pixel group starting at x = 2 (it relies on row/tmp padding when
; width < 10) - confirm the caller guarantees this.
;------------------------------------------------------------------------------

INIT_XMM sse2
cglobal be_blur, 5,15,9
.skip_prologue:                         ; entry point for the AVX2 narrow-width fallback
    mov         r6, 2                   ; x = 2
    pxor        xmm6, xmm6              ; zero, used to widen bytes to words
    mov         r7, r0                  ; src = buf (row 0)
    movzx       r8, byte [r7 + 1]       ; old_pix = src[1]
    movzx       r9, byte [r7]           ; old_sum = src[0]
    add         r9, r8                  ; old_sum += old_pix
    lea         r12, [r4 + r3 * 2]      ; col_sum_buf = tmp + stride (uint16_t elements)
    lea         r14, [r1 - 2]           ; vector bound = (w - 2)
    and         r14, -8                 ;              & ~7

    ; Row 0: horizontal [1 2 1] sums go straight into col_pix_buf.
.first_loop:
    movzx       r10, byte [r7 + r6]     ; temp1 = src[x]
    lea         r11, [r8 + r10]         ; temp2 = old_pix + temp1
    mov         r8, r10                 ; old_pix = temp1
    lea         r10, [r9 + r11]         ; temp1 = old_sum + temp2
    mov         r9, r11                 ; old_sum = temp2
    mov         word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1
    inc         r6                      ; x++
    cmp         r6, r1                  ; x < w
    jl          .first_loop

    ; Row 1: col_sum_buf = row0 + row1 horizontal sums, col_pix_buf = row1.
    mov         r6, 2                   ; x = 2
    lea         r7, [r0 + r3]           ; src = buf + stride
    movzx       r8, byte [r7 + 1]       ; old_pix = src[1]
    movzx       r9, byte [r7]           ; old_sum = src[0]
    add         r9, r8                  ; old_sum += old_pix
.second_loop:
    movzx       r10, byte [r7 + r6]     ; temp1 = src[x]
    lea         r11, [r8 + r10]         ; temp2 = old_pix + temp1
    mov         r8, r10                 ; old_pix = temp1
    lea         r10, [r9 + r11]         ; temp1 = old_sum + temp2
    mov         r9, r11                 ; old_sum = temp2
    movzx       r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]
    add         r11, r10                ; temp2 += temp1
    mov         word [r4 + r6 * 2], r10w  ; col_pix_buf[x] = temp1
    mov         word [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2
    inc         r6                      ; x++
    cmp         r6, r1                  ; x < w
    jl          .second_loop

    mov         r5, 2                   ; y = 2
.height_loop:
    mov         r10, r5                 ; tmpreg = y
    imul        r10, r3                 ; tmpreg *= stride
    lea         r7, [r0 + r10]          ; src = buf + y * stride
    sub         r10, r3                 ; tmpreg -= stride
    lea         r13, [r0 + r10]         ; dst = buf + (y - 1) * stride
    mov         r6, 2                   ; x = 2
    movzx       r10, byte [r7]          ; temp1 = src[0]
    movzx       r11, byte [r7 + 1]      ; temp2 = src[1]
    add         r10, r11                ; temp1 = src[0] + src[1]
    ; Seed the vector carries exactly like the scalar loops do:
    ; old_pix = src[x-1], old_sum = src[x-2] + src[x-1].
    movd        xmm0, r11d              ; old_pix_128 = src[1]
    movd        xmm1, r10d              ; old_sum_128 = src[0] + src[1]

    ; Eight pixels per iteration.
.width_loop:
    movq        xmm2, [r7 + r6]         ; new_pix = src[x .. x+7]
    punpcklbw   xmm2, xmm6              ; widen bytes to words
    movdqa      xmm3, xmm2              ; temp = new_pix
    pslldq      xmm3, 2                 ; temp <<= one word
    paddw       xmm3, xmm0              ; temp += old_pix_128 (fills word 0)
    paddw       xmm3, xmm2              ; temp[i] = src[x+i-1] + src[x+i]
    movdqa      xmm0, xmm2              ; old_pix_128 = new_pix
    psrldq      xmm0, 14                ; keep only the last word (src[x+7])
    movdqa      xmm2, xmm3              ; new_pix = temp
    pslldq      xmm2, 2                 ; new_pix <<= one word
    paddw       xmm2, xmm1              ; new_pix += old_sum_128 (fills word 0)
    paddw       xmm2, xmm3              ; new_pix[i] = temp[i-1] + temp[i]
    movdqa      xmm1, xmm3              ; old_sum_128 = temp
    psrldq      xmm1, 14                ; keep only the last word
    movdqu      xmm4, [r4 + r6 * 2]     ; old_col_pix = col_pix_buf[x ..]
    movdqu      [r4 + r6 * 2], xmm2     ; col_pix_buf[x ..] = new_pix
    movdqu      xmm5, [r12 + r6 * 2]    ; old_col_sum = col_sum_buf[x ..]
    movdqa      xmm3, xmm2              ; temp = new_pix
    paddw       xmm3, xmm4              ; temp += old_col_pix
    movdqu      [r12 + r6 * 2], xmm3    ; col_sum_buf[x ..] = temp
    paddw       xmm5, xmm3              ; old_col_sum += temp
    psrlw       xmm5, 4                 ; divide by 16
    packuswb    xmm5, xmm5              ; narrow words back to bytes
    movq        qword [r13 + r6 - 1], xmm5 ; dst[x-1 .. x+6] = result
    add         r6, 8                   ; x += 8
    cmp         r6, r14                 ; x < ((w - 2) & ~7)
    jl          .width_loop

    ; Scalar tail for the remaining columns of this row.
    movzx       r8, byte [r7 + r6 - 1]  ; old_pix = src[x-1]
    movzx       r9, byte [r7 + r6 - 2]  ; old_sum = src[x-2]
    add         r9, r8                  ; old_sum += old_pix
    jmp         .final_width_check
.final_width_loop:
    movzx       r10, byte [r7 + r6]     ; temp1 = src[x]
    lea         r11, [r8 + r10]         ; temp2 = old_pix + temp1
    mov         r8, r10                 ; old_pix = temp1
    lea         r10, [r9 + r11]         ; temp1 = old_sum + temp2
    mov         r9, r11                 ; old_sum = temp2
    movzx       r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]
    add         r11, r10                ; temp2 += temp1
    mov         word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1
    movzx       r10, word [r12 + r6 * 2] ; temp1 = col_sum_buf[x]
    add         r10, r11                ; temp1 += temp2
    shr         r10, 4                  ; temp1 >>= 4
    mov         byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1
    mov         [r12 + r6 * 2], r11w    ; col_sum_buf[x] = temp2
    inc         r6                      ; x++
.final_width_check:
    cmp         r6, r1                  ; x < w
    jl          .final_width_loop
    inc         r5                      ; y++
    cmp         r5, r2                  ; y < h
    jl          .height_loop
    RET

;------------------------------------------------------------------------------
; AVX2 variant of be_blur: same arguments, register roles and algorithm as
; the SSE2 version above, but the vector loop handles 16 pixels per
; iteration. Widths below 32 are handed to the SSE2 body; both functions use
; the same cglobal register counts, so entering after its prologue is valid.
;------------------------------------------------------------------------------

INIT_YMM avx2
cglobal be_blur, 5,15,9
    cmp         r1, 32
    jl          be_blur_sse2.skip_prologue ; narrow bitmap: use the SSE2 code
    mov         r6, 2                   ; x = 2
    vpxor       ymm6, ymm6, ymm6        ; zero, used as filler in vperm2i128
    mov         r7, r0                  ; src = buf (row 0)
    movzx       r8, byte [r7 + 1]       ; old_pix = src[1]
    movzx       r9, byte [r7]           ; old_sum = src[0]
    add         r9, r8                  ; old_sum += old_pix
    lea         r12, [r4 + r3 * 2]      ; col_sum_buf = tmp + stride (uint16_t elements)
    lea         r14, [r1 - 2]           ; vector bound = (w - 2)
    and         r14, -16                ;              & ~15
    vmovdqa     ymm7, [rel low_word_zero] ; mask clearing word 0

    ; Row 0: horizontal [1 2 1] sums go straight into col_pix_buf.
.first_loop:
    movzx       r10, byte [r7 + r6]     ; temp1 = src[x]
    lea         r11, [r8 + r10]         ; temp2 = old_pix + temp1
    mov         r8, r10                 ; old_pix = temp1
    lea         r10, [r9 + r11]         ; temp1 = old_sum + temp2
    mov         r9, r11                 ; old_sum = temp2
    mov         word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1
    inc         r6                      ; x++
    cmp         r6, r1                  ; x < w
    jl          .first_loop

    ; Row 1: col_sum_buf = row0 + row1 horizontal sums, col_pix_buf = row1.
    mov         r6, 2                   ; x = 2
    lea         r7, [r0 + r3]           ; src = buf + stride
    movzx       r8, byte [r7 + 1]       ; old_pix = src[1]
    movzx       r9, byte [r7]           ; old_sum = src[0]
    add         r9, r8                  ; old_sum += old_pix
.second_loop:
    movzx       r10, byte [r7 + r6]     ; temp1 = src[x]
    lea         r11, [r8 + r10]         ; temp2 = old_pix + temp1
    mov         r8, r10                 ; old_pix = temp1
    lea         r10, [r9 + r11]         ; temp1 = old_sum + temp2
    mov         r9, r11                 ; old_sum = temp2
    movzx       r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]
    add         r11, r10                ; temp2 += temp1
    mov         word [r4 + r6 * 2], r10w  ; col_pix_buf[x] = temp1
    mov         word [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2
    inc         r6                      ; x++
    cmp         r6, r1                  ; x < w
    jl          .second_loop

    mov         r5, 2                   ; y = 2
.height_loop:
    mov         r10, r5                 ; tmpreg = y
    imul        r10, r3                 ; tmpreg *= stride
    lea         r7, [r0 + r10]          ; src = buf + y * stride
    sub         r10, r3                 ; tmpreg -= stride
    lea         r13, [r0 + r10]         ; dst = buf + (y - 1) * stride
    mov         r6, 2                   ; x = 2
    movzx       r10, byte [r7]          ; temp1 = src[0]
    movzx       r11, byte [r7 + 1]      ; temp2 = src[1]
    add         r10, r11                ; temp1 = src[0] + src[1]
    ; Seed the vector carries exactly like the scalar loops do:
    ; old_pix = src[x-1], old_sum = src[x-2] + src[x-1].
    ; vmovd also zeroes the rest of the ymm register.
    vmovd       xmm0, r11d              ; old_pix (ymm0) = src[1]
    vmovd       xmm1, r10d              ; old_sum (ymm1) = src[0] + src[1]

    ; Sixteen pixels per iteration.
.width_loop:
    vpmovzxbw   ymm2, [r7 + r6]         ; new_pix = src[x .. x+15] widened to words
    ; temp = (new_pix shifted up one word across both lanes) + old_pix + new_pix
    vpermq      ymm8, ymm2, 0x4e        ; swap the 128-bit lanes
    vpalignr    ymm3, ymm2, ymm8, 14    ; per lane: last word of other lane, then words 0..6
    vpand       ymm3, ymm3, ymm7        ; drop the wrapped-around word 0
    vpaddw      ymm3, ymm3, ymm0        ; temp += old_pix (fills word 0)
    vpaddw      ymm3, ymm3, ymm2        ; temp[i] = src[x+i-1] + src[x+i]
    vperm2i128  ymm0, ymm2, ymm6, 0x21  ; low lane = high lane of new_pix, high lane = 0
    vpsrldq     ymm0, ymm0, 14          ; old_pix = last word of new_pix
    ; new_pix = (temp shifted up one word across both lanes) + old_sum + temp
    vpermq      ymm8, ymm3, 0x4e        ; swap the 128-bit lanes
    vpalignr    ymm2, ymm3, ymm8, 14    ; per lane: last word of other lane, then words 0..6
    vpand       ymm2, ymm2, ymm7        ; drop the wrapped-around word 0
    vpaddw      ymm2, ymm2, ymm1        ; new_pix += old_sum (fills word 0)
    vpaddw      ymm2, ymm2, ymm3        ; new_pix[i] = temp[i-1] + temp[i]
    vperm2i128  ymm1, ymm3, ymm6, 0x21  ; low lane = high lane of temp, high lane = 0
    vpsrldq     ymm1, ymm1, 14          ; old_sum = last word of temp
    vmovdqu     ymm4, [r4 + r6 * 2]     ; old_col_pix = col_pix_buf[x ..]
    vmovdqu     [r4 + r6 * 2], ymm2     ; col_pix_buf[x ..] = new_pix
    vmovdqu     ymm5, [r12 + r6 * 2]    ; old_col_sum = col_sum_buf[x ..]
    vpaddw      ymm3, ymm2, ymm4        ; temp = new_pix + old_col_pix
    vmovdqu     [r12 + r6 * 2], ymm3    ; col_sum_buf[x ..] = temp
    vpaddw      ymm5, ymm5, ymm3        ; old_col_sum += temp
    vpsrlw      ymm5, ymm5, 4           ; divide by 16
    vpackuswb   ymm5, ymm5, ymm5        ; narrow to bytes (per lane, duplicated)
    vpermq      ymm5, ymm5, 11_01_10_00b ; gather both lanes' 8 bytes into the low 128 bits
    vmovdqu     [r13 + r6 - 1], xmm5    ; dst[x-1 .. x+14] = result
    add         r6, 16                  ; x += 16
    cmp         r6, r14                 ; x < ((w - 2) & ~15)
    jl          .width_loop

    ; Scalar tail for the remaining columns of this row.
    movzx       r8, byte [r7 + r6 - 1]  ; old_pix = src[x-1]
    movzx       r9, byte [r7 + r6 - 2]  ; old_sum = src[x-2]
    add         r9, r8                  ; old_sum += old_pix
    jmp         .final_width_check
.final_width_loop:
    movzx       r10, byte [r7 + r6]     ; temp1 = src[x]
    lea         r11, [r8 + r10]         ; temp2 = old_pix + temp1
    mov         r8, r10                 ; old_pix = temp1
    lea         r10, [r9 + r11]         ; temp1 = old_sum + temp2
    mov         r9, r11                 ; old_sum = temp2
    movzx       r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]
    add         r11, r10                ; temp2 += temp1
    mov         word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1
    movzx       r10, word [r12 + r6 * 2] ; temp1 = col_sum_buf[x]
    add         r10, r11                ; temp1 += temp2
    shr         r10, 4                  ; temp1 >>= 4
    mov         byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1
    mov         [r12 + r6 * 2], r11w    ; col_sum_buf[x] = temp2
    inc         r6                      ; x++
.final_width_check:
    cmp         r6, r1                  ; x < w
    jl          .final_width_loop
    inc         r5                      ; y++
    cmp         r5, r2                  ; y < h
    jl          .height_loop
    RET