;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Christophe Gisquet
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
    movd    xm4, maskd
    SPLATW  m4, xm4            ; replicate the 16-bit sample mask across m4
    add     wd, wd             ; width in 16-bit elements -> width in bytes
    test    wq, 2*mmsize - 1
    jz %%.tomainloop
    push  tmpq
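; Scalar pre-pass: working backwards from the end of the buffer, process one
; 16-bit word per iteration until the remaining byte count in wq is a multiple
; of 2*mmsize. tmpq is saved above and restored below since it is only used here.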
%%.wordloop:
    sub     wq, 2
%ifidn %2, add
    mov   tmpw, [srcq+wq]
    add   tmpw, [dstq+wq]
%else
    mov   tmpw, [src1q+wq]
    sub   tmpw, [src2q+wq]
%endif
    and   tmpw, maskw
    mov     [dstq+wq], tmpw
    test    wq, 2*mmsize - 1
    jnz %%.wordloop
    pop   tmpq
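; Main-loop setup: advance the pointers past the region the vector loop will
; cover and negate wq, so that [ptr+wq] starts at the original base address and
; wq counts up towards zero. If nothing is left to do, skip the loop entirely.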
%%.tomainloop:
%ifidn %2, add
    add     srcq, wq
%else
    add     src1q, wq
    add     src2q, wq
%endif
    add     dstq, wq
    neg     wq
    jz      %%.end
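; Vector loop: each iteration loads 2*mmsize bytes from each input, adds or
; subtracts them word-wise (paddw/psubw), masks the results to the active bit
; depth with m4, and stores 2*mmsize bytes to dst.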
%%.loop:
%ifidn %2, add
    mov%1   m0, [srcq+wq]
    mov%1   m1, [dstq+wq]
    mov%1   m2, [srcq+wq+mmsize]
    mov%1   m3, [dstq+wq+mmsize]
%else
    mov%1   m0, [src1q+wq]
    mov%1   m1, [src2q+wq]
    mov%1   m2, [src1q+wq+mmsize]
    mov%1   m3, [src2q+wq+mmsize]
%endif
    p%2w    m0, m1
    p%2w    m2, m3
    pand    m0, m4
    pand    m2, m4
    mov%1   [dstq+wq]       , m0
    mov%1   [dstq+wq+mmsize], m2
    add     wq, 2*mmsize
    jl %%.loop
%%.end:
    RET
%endmacro
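
; Usage sketch (illustrative only, not taken from this file): the macro is
; meant to be expanded inside x86inc cglobal entry points. The entry-point
; names and register counts below are assumptions, shown only to illustrate
; the two macro parameters.
;
;     INIT_XMM sse2
;     cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
;         INT16_LOOP a, add        ; dst[i] = (dst[i] + src[i]) & mask
;
;     INIT_XMM sse2
;     cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
;         INT16_LOOP a, sub        ; dst[i] = (src1[i] - src2[i]) & mask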