1;******************************************************************************
2;* be_blur.asm: SSE2 \be blur
3;******************************************************************************
4;* Copyright (C) 2013 Rodger Combs <rcombs@rcombs.me>
5;*
6;* This file is part of libass.
7;*
8;* Permission to use, copy, modify, and distribute this software for any
9;* purpose with or without fee is hereby granted, provided that the above
10;* copyright notice and this permission notice appear in all copies.
11;*
12;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
13;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
14;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
15;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
16;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
17;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
18;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19;******************************************************************************
20
21%include "x86inc.asm"
22
; 256-bit AND mask: the lowest 16-bit word is zero, every other bit is set.
; The AVX2 variant loads it with vmovdqa (hence the 32-byte section alignment)
; and uses it with vpand to clear word 0 of a vector after the cross-lane
; one-word shift, where that word holds a wrapped-around element.
SECTION_RODATA 32
low_word_zero: dd 0xFFFF0000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
25
26SECTION .text
27
28;------------------------------------------------------------------------------
; void be_blur( uint8_t *buf, unsigned width,
;               unsigned height, unsigned stride,
;               uint16_t *tmp);
32;------------------------------------------------------------------------------
33
INIT_XMM sse2
;------------------------------------------------------------------------------
; be_blur, SSE2 variant.
;
; In-place blur of an 8-bit image: every interior pixel becomes
;     ( h[y-1][x] + 2*h[y][x] + h[y+1][x] ) >> 4
; where h[r][x] = p[r][x-1] + 2*p[r][x] + p[r][x+1] is the horizontal pass.
; Only rows 1..h-2 and columns 1..w-2 are written.
;
; Arguments (x86inc numbering):
;   r0 = buf, r1 = width, r2 = height, r3 = stride (bytes), r4 = tmp
; tmp is used as two uint16_t arrays indexed by x:
;   col_pix_buf = tmp           horizontal sums of the previous row
;   col_sum_buf = tmp + stride  horizontal sums of the previous two rows, added
;
; Register roles:
;   r5  = y                     r6  = x
;   r7  = src row (row y)       r13 = dst row (row y-1)
;   r8  = old_pix  (p[x-1])     r9  = old_sum (p[x-2] + p[x-1])
;   r10, r11 = scratch          r12 = col_sum_buf
;   r14 = (w - 2) & ~7, upper bound for the 8-pixel vector loop
;   xmm0 = old_pix carry in word 0, xmm1 = old_sum carry in word 0
;   xmm6 = zero
;
; The register counts in cglobal must stay identical to the AVX2 variant,
; which jumps to .skip_prologue after running its own prologue.
;------------------------------------------------------------------------------
cglobal be_blur, 5,15,9
.skip_prologue:
    pxor        xmm6, xmm6              ; zero, used to widen bytes to words
    lea         r12, [r4 + r3 * 2]      ; col_sum_buf = (uint16_t *)tmp + stride
    lea         r14, [r1 - 2]
    and         r14, -8                 ; r14 = (w - 2) & ~7

    ; ---- row 0: fill col_pix_buf with the horizontal sums -------------------
    mov         r6, 2                   ; x = 2
    mov         r7, r0                  ; src = buf
    movzx       r8, byte [r7 + 1]       ; old_pix = src[1]
    movzx       r9, byte [r7]
    add         r9, r8                  ; old_sum = src[0] + src[1]
.first_loop:
    movzx       r10, byte [r7 + r6]     ; temp1 = src[x]
    lea         r11, [r8 + r10]         ; temp2 = old_pix + temp1
    mov         r8, r10                 ; old_pix = temp1
    lea         r10, [r9 + r11]         ; temp1 = old_sum + temp2
    mov         r9, r11                 ; old_sum = temp2
    mov         word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1
    inc         r6
    cmp         r6, r1                  ; x < w
    jl          .first_loop

    ; ---- row 1: update col_pix_buf, fill col_sum_buf ------------------------
    mov         r6, 2                   ; x = 2
    lea         r7, [r0 + r3]           ; src = buf + stride
    movzx       r8, byte [r7 + 1]       ; old_pix = src[1]
    movzx       r9, byte [r7]
    add         r9, r8                  ; old_sum = src[0] + src[1]
.second_loop:
    movzx       r10, byte [r7 + r6]     ; temp1 = src[x]
    lea         r11, [r8 + r10]         ; temp2 = old_pix + temp1
    mov         r8, r10                 ; old_pix = temp1
    lea         r10, [r9 + r11]         ; temp1 = old_sum + temp2
    mov         r9, r11                 ; old_sum = temp2
    movzx       r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]
    add         r11, r10                ; temp2 += temp1
    mov         word [r4 + r6 * 2], r10w  ; col_pix_buf[x] = temp1
    mov         word [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2
    inc         r6
    cmp         r6, r1                  ; x < w
    jl          .second_loop

    ; ---- rows 2..h-1: each row y produces output row y-1 --------------------
    mov         r5, 2                   ; y = 2
    jmp         .height_check           ; test first: nothing to do if h <= 2
.height_loop:
    mov         r10, r5
    imul        r10, r3                 ; y * stride
    lea         r7, [r0 + r10]          ; src = buf + y * stride
    sub         r10, r3
    lea         r13, [r0 + r10]         ; dst = buf + (y - 1) * stride
    mov         r6, 2                   ; x = 2
    movzx       r10, byte [r7]
    movzx       r11, byte [r7 + 1]      ; r11 = src[1]
    add         r10, r11                ; r10 = src[0] + src[1]
    ; Seed the carries exactly like the scalar loops do: old_pix = src[1],
    ; old_sum = src[0] + src[1].  Swapping them leaks src[0] into column 3.
    movd        xmm0, r11d              ; old_pix_128 = src[1]
    movd        xmm1, r10d              ; old_sum_128 = src[0] + src[1]
    jmp         .width_check            ; test first: rows with w < 10 are
                                        ; handled entirely by the scalar tail
.width_loop:
    movq        xmm2, [r7 + r6]         ; 8 pixels src[x..x+7]
    punpcklbw   xmm2, xmm6              ; new_pix: widen to 8 words
    movdqa      xmm3, xmm2
    pslldq      xmm3, 2                 ; shift up one word: lane i = p[x+i-1]
    paddw       xmm3, xmm0              ; word 0 += carried old_pix
    paddw       xmm3, xmm2              ; temp: lane i = p[x+i-1] + p[x+i]
    movdqa      xmm0, xmm2
    psrldq      xmm0, 14                ; next old_pix = top word of new_pix
    movdqa      xmm2, xmm3
    pslldq      xmm2, 2                 ; shift temp up one word
    paddw       xmm2, xmm1              ; word 0 += carried old_sum
    paddw       xmm2, xmm3              ; new_pix = horizontal [1 2 1] sums
    movdqa      xmm1, xmm3
    psrldq      xmm1, 14                ; next old_sum = top word of temp
    movdqu      xmm4, [r4 + r6 * 2]     ; old_col_pix = col_pix_buf[x..x+7]
    movdqu      [r4 + r6 * 2], xmm2     ; col_pix_buf[x..x+7] = new_pix
    movdqu      xmm5, [r12 + r6 * 2]    ; old_col_sum = col_sum_buf[x..x+7]
    movdqa      xmm3, xmm2
    paddw       xmm3, xmm4              ; temp = new_pix + old_col_pix
    movdqu      [r12 + r6 * 2], xmm3    ; col_sum_buf[x..x+7] = temp
    paddw       xmm5, xmm3              ; old_col_sum += temp
    psrlw       xmm5, 4                 ; >> 4
    packuswb    xmm5, xmm5              ; narrow to bytes
    movq        qword [r13 + r6 - 1], xmm5 ; dst[x-1 .. x+6]
    add         r6, 8                   ; x += 8
.width_check:
    cmp         r6, r14                 ; x < ((w - 2) & ~7)
    jl          .width_loop

    ; ---- scalar tail for the remaining pixels of this row -------------------
    movzx       r8, byte [r7 + r6 - 1]  ; old_pix = src[x-1]
    movzx       r9, byte [r7 + r6 - 2]
    add         r9, r8                  ; old_sum = src[x-2] + src[x-1]
    jmp         .final_width_check
.final_width_loop:
    movzx       r10, byte [r7 + r6]     ; temp1 = src[x]
    lea         r11, [r8 + r10]         ; temp2 = old_pix + temp1
    mov         r8, r10                 ; old_pix = temp1
    lea         r10, [r9 + r11]         ; temp1 = old_sum + temp2
    mov         r9, r11                 ; old_sum = temp2
    movzx       r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]
    add         r11, r10                ; temp2 += temp1
    mov         word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1
    movzx       r10, word [r12 + r6 * 2] ; temp1 = col_sum_buf[x]
    add         r10, r11                ; temp1 += temp2
    shr         r10, 4                  ; temp1 >>= 4
    mov         byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1
    mov         [r12 + r6 * 2], r11w    ; col_sum_buf[x] = temp2
    inc         r6
.final_width_check:
    cmp         r6, r1                  ; x < w
    jl          .final_width_loop

    inc         r5                      ; y++
.height_check:
    cmp         r5, r2                  ; y < h
    jl          .height_loop
    RET
141
INIT_YMM avx2
;------------------------------------------------------------------------------
; be_blur, AVX2 variant.
;
; Same operation as the SSE2 variant, 16 pixels per vector iteration:
; every interior pixel becomes
;     ( h[y-1][x] + 2*h[y][x] + h[y+1][x] ) >> 4
; where h[r][x] = p[r][x-1] + 2*p[r][x] + p[r][x+1].
; Only rows 1..h-2 and columns 1..w-2 are written.
;
; Arguments (x86inc numbering):
;   r0 = buf, r1 = width, r2 = height, r3 = stride (bytes), r4 = tmp
; tmp is used as two uint16_t arrays indexed by x:
;   col_pix_buf = tmp, col_sum_buf = tmp + stride
;
; Register roles:
;   r5  = y                     r6  = x
;   r7  = src row (row y)       r13 = dst row (row y-1)
;   r8  = old_pix  (p[x-1])     r9  = old_sum (p[x-2] + p[x-1])
;   r10, r11 = scratch          r12 = col_sum_buf
;   r14 = (w - 2) & ~15, upper bound for the 16-pixel vector loop
;   ymm0 = old_pix carry in word 0, ymm1 = old_sum carry in word 0
;   ymm6 = zero, ymm7 = low_word_zero mask, ymm8 = lane-swapped scratch
;
; Rows narrower than 32 pixels are handed to the SSE2 code.  The jump lands
; after that function's prologue, so both cglobal lines must request the same
; register counts.
;------------------------------------------------------------------------------
cglobal be_blur, 5,15,9
    cmp         r1, 32
    jl          be_blur_sse2.skip_prologue
    vpxor       ymm6, ymm6, ymm6        ; zero, used to widen bytes to words
    vmovdqa     ymm7, [low_word_zero wrt rip] ; mask clearing word 0
    lea         r12, [r4 + r3 * 2]      ; col_sum_buf = (uint16_t *)tmp + stride
    lea         r14, [r1 - 2]
    and         r14, -16                ; r14 = (w - 2) & ~15

    ; ---- row 0: fill col_pix_buf with the horizontal sums -------------------
    mov         r6, 2                   ; x = 2
    mov         r7, r0                  ; src = buf
    movzx       r8, byte [r7 + 1]       ; old_pix = src[1]
    movzx       r9, byte [r7]
    add         r9, r8                  ; old_sum = src[0] + src[1]
.first_loop:
    movzx       r10, byte [r7 + r6]     ; temp1 = src[x]
    lea         r11, [r8 + r10]         ; temp2 = old_pix + temp1
    mov         r8, r10                 ; old_pix = temp1
    lea         r10, [r9 + r11]         ; temp1 = old_sum + temp2
    mov         r9, r11                 ; old_sum = temp2
    mov         word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1
    inc         r6
    cmp         r6, r1                  ; x < w
    jl          .first_loop

    ; ---- row 1: update col_pix_buf, fill col_sum_buf ------------------------
    mov         r6, 2                   ; x = 2
    lea         r7, [r0 + r3]           ; src = buf + stride
    movzx       r8, byte [r7 + 1]       ; old_pix = src[1]
    movzx       r9, byte [r7]
    add         r9, r8                  ; old_sum = src[0] + src[1]
.second_loop:
    movzx       r10, byte [r7 + r6]     ; temp1 = src[x]
    lea         r11, [r8 + r10]         ; temp2 = old_pix + temp1
    mov         r8, r10                 ; old_pix = temp1
    lea         r10, [r9 + r11]         ; temp1 = old_sum + temp2
    mov         r9, r11                 ; old_sum = temp2
    movzx       r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]
    add         r11, r10                ; temp2 += temp1
    mov         word [r4 + r6 * 2], r10w  ; col_pix_buf[x] = temp1
    mov         word [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2
    inc         r6
    cmp         r6, r1                  ; x < w
    jl          .second_loop

    ; ---- rows 2..h-1: each row y produces output row y-1 --------------------
    mov         r5, 2                   ; y = 2
    jmp         .height_check           ; test first: nothing to do if h <= 2
.height_loop:
    mov         r10, r5
    imul        r10, r3                 ; y * stride
    lea         r7, [r0 + r10]          ; src = buf + y * stride
    sub         r10, r3
    lea         r13, [r0 + r10]         ; dst = buf + (y - 1) * stride
    mov         r6, 2                   ; x = 2
    movzx       r10, byte [r7]
    movzx       r11, byte [r7 + 1]      ; r11 = src[1]
    add         r10, r11                ; r10 = src[0] + src[1]
    ; Seed the carries exactly like the scalar loops do: old_pix = src[1],
    ; old_sum = src[0] + src[1].  Swapping them leaks src[0] into column 3.
    ; The VEX-encoded vmovd also clears the rest of ymm0 / ymm1.
    vmovd       xmm0, r11d              ; old_pix carry = src[1]
    vmovd       xmm1, r10d              ; old_sum carry = src[0] + src[1]
    ; w >= 32 here, so r14 >= 16 and the vector loop always runs at least once.
.width_loop:
    ; Load only the 16 bytes that are used; a 32-byte memory operand on vpermq
    ; would read 16 bytes past the pixels being processed.
    vmovdqu     xmm2, [r7 + r6]         ; src[x..x+15]
    vpermq      ymm2, ymm2, 0x10        ; qword 0 -> low lane, qword 1 -> high lane
    vpunpcklbw  ymm2, ymm2, ymm6        ; new_pix: 16 words, pixels in order
    ; temp = (new_pix shifted up by one word across both lanes) + carry + new_pix
    vpermq      ymm8, ymm2, 0x4e        ; swap the 128-bit lanes
    vpalignr    ymm3, ymm2, ymm8, 14    ; per lane: 7 low words of new_pix above
                                        ; the top word of the other lane
    vpand       ymm3, ymm3, ymm7        ; clear word 0 (wrapped-around element)
    vpaddw      ymm3, ymm3, ymm0        ; word 0 += carried old_pix
    vpaddw      ymm3, ymm3, ymm2        ; temp: lane i = p[x+i-1] + p[x+i]
    vperm2i128  ymm0, ymm2, ymm6, 0x21  ; low lane = high lane of new_pix, rest 0
    vpsrldq     ymm0, ymm0, 14          ; next old_pix = last word of new_pix
    ; new_pix = (temp shifted up by one word) + carry + temp
    vpermq      ymm8, ymm3, 0x4e        ; swap the 128-bit lanes
    vpalignr    ymm2, ymm3, ymm8, 14    ; only word 7 of each ymm8 lane is used,
                                        ; so ymm8 needs no masking beforehand
    vpand       ymm2, ymm2, ymm7        ; clear word 0 (wrapped-around element)
    vpaddw      ymm2, ymm2, ymm1        ; word 0 += carried old_sum
    vpaddw      ymm2, ymm2, ymm3        ; new_pix = horizontal [1 2 1] sums
    vperm2i128  ymm1, ymm3, ymm6, 0x21  ; low lane = high lane of temp, rest 0
    vpsrldq     ymm1, ymm1, 14          ; next old_sum = last word of temp
    ; vertical pass through the column buffers
    vmovdqu     ymm4, [r4 + r6 * 2]     ; old_col_pix = col_pix_buf[x..x+15]
    vmovdqu     [r4 + r6 * 2], ymm2     ; col_pix_buf[x..x+15] = new_pix
    vmovdqu     ymm5, [r12 + r6 * 2]    ; old_col_sum = col_sum_buf[x..x+15]
    vpaddw      ymm3, ymm2, ymm4        ; temp = new_pix + old_col_pix
    vmovdqu     [r12 + r6 * 2], ymm3    ; col_sum_buf[x..x+15] = temp
    vpaddw      ymm5, ymm5, ymm3        ; old_col_sum += temp
    vpsrlw      ymm5, ymm5, 4           ; >> 4
    vpackuswb   ymm5, ymm5, ymm5        ; narrow to bytes (per 128-bit lane)
    vpermq      ymm5, ymm5, 11_01_10_00b ; gather both lanes' bytes into xmm5
    vmovdqu     [r13 + r6 - 1], xmm5    ; dst[x-1 .. x+14]
    add         r6, 16                  ; x += 16
    cmp         r6, r14                 ; x < ((w - 2) & ~15)
    jl          .width_loop

    ; ---- scalar tail for the remaining pixels of this row -------------------
    movzx       r8, byte [r7 + r6 - 1]  ; old_pix = src[x-1]
    movzx       r9, byte [r7 + r6 - 2]
    add         r9, r8                  ; old_sum = src[x-2] + src[x-1]
    jmp         .final_width_check
.final_width_loop:
    movzx       r10, byte [r7 + r6]     ; temp1 = src[x]
    lea         r11, [r8 + r10]         ; temp2 = old_pix + temp1
    mov         r8, r10                 ; old_pix = temp1
    lea         r10, [r9 + r11]         ; temp1 = old_sum + temp2
    mov         r9, r11                 ; old_sum = temp2
    movzx       r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]
    add         r11, r10                ; temp2 += temp1
    mov         word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1
    movzx       r10, word [r12 + r6 * 2] ; temp1 = col_sum_buf[x]
    add         r10, r11                ; temp1 += temp2
    shr         r10, 4                  ; temp1 >>= 4
    mov         byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1
    mov         [r12 + r6 * 2], r11w    ; col_sum_buf[x] = temp2
    inc         r6
.final_width_check:
    cmp         r6, r1                  ; x < w
    jl          .final_width_loop

    inc         r5                      ; y++
.height_check:
    cmp         r5, r2                  ; y < h
    jl          .height_loop
    RET
254
255