1;******************************************************************************
2;* SIMD-optimized clear block functions
3;* Copyright (c) 2002 Michael Niedermayer
4;* Copyright (c) 2008 Loren Merritt
5;* Copyright (c) 2009 Fiona Glaser
6;*
7;* AVX version by Jokyo Images
8;*
9;* This file is part of FFmpeg.
10;*
11;* FFmpeg is free software; you can redistribute it and/or
12;* modify it under the terms of the GNU Lesser General Public
13;* License as published by the Free Software Foundation; either
14;* version 2.1 of the License, or (at your option) any later version.
15;*
16;* FFmpeg is distributed in the hope that it will be useful,
17;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19;* Lesser General Public License for more details.
20;*
21;* You should have received a copy of the GNU Lesser General Public
22;* License along with FFmpeg; if not, write to the Free Software
23;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24;******************************************************************************
25
26%include "libavutil/x86/x86util.asm"
27
28SECTION .text
29
;------------------------------------------------------------------------------
; void ff_clear_block(int16_t *blocks);
;
; Zero-fills the start of the buffer at `blocks` with a fully unrolled run of
; mova stores from a cleared register; there is no loop at run time.
;------------------------------------------------------------------------------
; %1 = number of xmm registers used (forwarded to cglobal)
; %2 = number of inline store loops; every loop contributes four stores, so
;      the expansion writes 4 * %2 * mmsize bytes in total
;
; ZERO must be %define'd by the caller to the instruction that clears a
; register for the active instruction set.
; NOTE(review): mova is the x86inc aligned move, so `blocks` is presumably
; required to be aligned to mmsize -- confirm against the C callers.
%macro CLEAR_BLOCK 2
cglobal clear_block, 1, 1, %1, blocks
    ZERO  m0, m0, m0                    ; m0 = 0, the value written everywhere
%assign %%off 0                         ; byte offset of the next store
%rep 4 * %2
    mova  [blocksq+%%off], m0
%assign %%off %%off+mmsize
%endrep
    RET
%endmacro
48
; Expand ff_clear_block once per instruction set. ZERO selects the
; register-clearing instruction used inside CLEAR_BLOCK. The store-loop count
; halves each time the register width is presumed to double, which would keep
; the number of bytes cleared constant -- confirm against x86inc's mmsize.

; MMX: no xmm registers, 4 store loops (16 stores).
INIT_MMX mmx
%define ZERO pxor
CLEAR_BLOCK 0, 4

; SSE: one xmm register, 2 store loops (8 stores).
INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCK 1, 2

; AVX: one register, 1 store loop (4 stores). ZERO keeps the value used for
; the SSE build; it is restated here only to make that explicit.
INIT_YMM avx
%define ZERO xorps
CLEAR_BLOCK 1, 1
57
;------------------------------------------------------------------------------
; void ff_clear_blocks(int16_t *blocks);
;
; Zero-fills 768 bytes starting at `blocks`, eight mova stores per loop
; iteration.
; NOTE(review): 768 bytes is presumably six blocks of 64 int16_t coefficients
; -- confirm against the C callers.
;------------------------------------------------------------------------------
; %1 = number of xmm registers used (forwarded to cglobal)
;
; ZERO must be %define'd by the caller to the instruction that clears a
; register for the active instruction set.
%macro CLEAR_BLOCKS 1
%assign %%total_bytes 768               ; size of the region being cleared
cglobal clear_blocks, 1, 2, %1, blocks, len
    ; Point blocksq one past the end of the region and let lenq count up from
    ; -total towards zero, so the add at the bottom of the loop also provides
    ; the termination test through the sign flag.
    add   blocksq, %%total_bytes
    mov      lenq, -%%total_bytes
    ZERO       m0, m0, m0               ; m0 = 0, the value written everywhere
.loop:
%assign %%off 0                         ; byte offset inside this iteration
%rep 8
    mova  [blocksq+lenq+%%off], m0
%assign %%off %%off+mmsize
%endrep
    add   lenq, 8*mmsize                ; advance by the bytes just written
    js .loop                            ; lenq still negative: more to clear
    RET
%endmacro
80
; Expand ff_clear_blocks once per instruction set. ZERO selects the
; register-clearing instruction used inside CLEAR_BLOCKS; the single macro
; argument is the xmm register count handed to cglobal.

; MMX: no xmm registers.
INIT_MMX mmx
%define ZERO pxor
CLEAR_BLOCKS 0

; SSE: one xmm register.
INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCKS 1

; AVX: one register. ZERO keeps the value used for the SSE build; it is
; restated here only to make that explicit.
INIT_YMM avx
%define ZERO xorps
CLEAR_BLOCKS 1
89