1 /*
2  * Copyright © 2018, Niklas Haas
3  * Copyright © 2018, VideoLAN and dav1d authors
4  * Copyright © 2018, Two Orioles, LLC
5  * Copyright © 2021, Martin Storsjo
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright notice, this
12  *    list of conditions and the following disclaimer.
13  *
14  * 2. Redistributions in binary form must reproduce the above copyright notice,
15  *    this list of conditions and the following disclaimer in the documentation
16  *    and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
22  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 
30 #include "src/cpu.h"
31 #include "src/film_grain.h"
32 #include "asm-offsets.h"
33 
#if ARCH_AARCH64

/*
 * Cross-check the FGD_* constants against the C layout of Dav1dFilmGrainData.
 * NOTE(review): CHECK_OFFSET is defined outside this file (presumably in
 * asm-offsets.h) and is assumed to be a compile-time offsetof() check.
 */
CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED);
CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG);
CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y);
CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_uv, FGD_AR_COEFFS_UV);
CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_shift, FGD_AR_COEFF_SHIFT);
CHECK_OFFSET(Dav1dFilmGrainData, grain_scale_shift, FGD_GRAIN_SCALE_SHIFT);

CHECK_OFFSET(Dav1dFilmGrainData, scaling_shift, FGD_SCALING_SHIFT);
CHECK_OFFSET(Dav1dFilmGrainData, uv_mult, FGD_UV_MULT);
CHECK_OFFSET(Dav1dFilmGrainData, uv_luma_mult, FGD_UV_LUMA_MULT);
CHECK_OFFSET(Dav1dFilmGrainData, uv_offset, FGD_UV_OFFSET);
CHECK_OFFSET(Dav1dFilmGrainData, clip_to_restricted_range, FGD_CLIP_TO_RESTRICTED_RANGE);

/*
 * Luma grain generator. Only declared here; the definition lives outside
 * this file. Installed as c->generate_grain_y by the init function below.
 */
void BF(dav1d_generate_grain_y, neon)(
    entry buf[][GRAIN_WIDTH],
    const Dav1dFilmGrainData *const data
    HIGHBD_DECL_SUFFIX);

/*
 * Declares the chroma grain generator for one pixel layout; `layout` is the
 * suffix appended to the function name (420, 422 or 444).
 */
#define GEN_GRAIN_UV(layout) \
void BF(dav1d_generate_grain_uv_ ## layout, neon)( \
    entry buf[][GRAIN_WIDTH], \
    const entry buf_y[][GRAIN_WIDTH], \
    const Dav1dFilmGrainData *const data, \
    const intptr_t uv \
    HIGHBD_DECL_SUFFIX)

GEN_GRAIN_UV(420);
GEN_GRAIN_UV(422);
GEN_GRAIN_UV(444);
#endif /* ARCH_AARCH64 */
64 
65 // Use ptrdiff_t instead of int for the last few parameters, to get the
66 // same layout of parameters on the stack across platforms.
67 void BF(dav1d_fgy_32x32, neon)(pixel *const dst,
68                                const pixel *const src,
69                                const ptrdiff_t stride,
70                                const uint8_t scaling[SCALING_SIZE],
71                                const int scaling_shift,
72                                const entry grain_lut[][GRAIN_WIDTH],
73                                const int offsets[][2],
74                                const int h, const ptrdiff_t clip,
75                                const ptrdiff_t type
76                                HIGHBD_DECL_SUFFIX);
77 
78 // Use ptrdiff_t instead of int for the last few parameters, to get the
79 // parameters on the stack with the same layout across platforms.
80 #define FGUV(suff) \
81 void BF(dav1d_fguv_32x32_ ## suff, neon)(pixel *const dst, \
82                                          const pixel *const src, \
83                                          const ptrdiff_t stride, \
84                                          const uint8_t scaling[SCALING_SIZE], \
85                                          const Dav1dFilmGrainData *const data, \
86                                          const entry grain_lut[][GRAIN_WIDTH], \
87                                          const pixel *const luma_row, \
88                                          const ptrdiff_t luma_stride, \
89                                          const int offsets[][2], \
90                                          const ptrdiff_t h, const ptrdiff_t uv, \
91                                          const ptrdiff_t is_id, \
92                                          const ptrdiff_t type \
93                                          HIGHBD_DECL_SUFFIX)
94 
95 FGUV(420);
96 FGUV(422);
97 FGUV(444);
98 
/**
 * Step the pseudo-random shift register once and return its top bits.
 *
 * The feedback bit is the XOR of register bits 0, 1, 3 and 12. The register
 * is shifted right by one and the feedback bit is inserted at bit 15.
 *
 * @param bits  How many of the most-significant bits (counted from bit 15)
 *              of the updated register to return. The shift and mask below
 *              are only well-defined for 0 <= bits <= 16.
 * @param state In/out register value; overwritten with the stepped value.
 *              Callers in this file only ever store 16-bit values here.
 * @return      The upper `bits` bits of the updated register.
 */
static inline int get_random_number(const int bits, unsigned *const state) {
    // Keep the register unsigned throughout: it is a bit pattern, and routing
    // it through a signed int makes the conversion and the right shifts
    // implementation-defined once the top bit is set (sign bits would be
    // smeared back into *state).
    const unsigned r = *state;
    const unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
    *state = (r >> 1) | (bit << 15);

    return (*state >> (16 - bits)) & ((1 << bits) - 1);
}
106 
/**
 * Run the NEON luma grain kernel over one row of blocks.
 *
 * Steps across the row in BLOCK_SIZE-wide columns, draws a pseudo-random
 * grain offset per column (and per tracked row), and hands each column to
 * dav1d_fgy_32x32 together with a mask describing which overlaps apply.
 *
 * @param dst_row   Start of the destination row; advanced by bx per block.
 * @param src_row   Start of the source row; advanced by bx per block.
 * @param stride    Forwarded unchanged to the kernel.
 * @param data      Film grain parameters; seed, overlap_flag, scaling_shift
 *                  and clip_to_restricted_range are read here.
 * @param pw        Row width, in the same units as the bx column index.
 * @param scaling   Forwarded unchanged to the kernel.
 * @param grain_lut Forwarded unchanged to the kernel.
 * @param bh        Forwarded to the kernel as its `h` argument.
 * @param row_num   Index of this block row; selects the seeds and whether
 *                  vertical overlap is requested.
 */
static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row,
                             const ptrdiff_t stride,
                             const Dav1dFilmGrainData *const data, const size_t pw,
                             const uint8_t scaling[SCALING_SIZE],
                             const entry grain_lut[][GRAIN_WIDTH],
                             const int bh, const int row_num HIGHBD_DECL_SUFFIX)
{
    // The previous row's generator is only tracked when overlap is enabled
    // and there actually is a previous row.
    const int num_seeds = (data->overlap_flag && row_num > 0) ? 2 : 1;

    // row_seed[0] drives the current row, row_seed[1] the previous one
    unsigned row_seed[2];
    for (int r = 0; r < num_seeds; r++) {
        const int row = row_num - r;
        row_seed[r] = data->seed;
        row_seed[r] ^= ((row * 37 + 178) & 0xFF) << 8;
        row_seed[r] ^= (row * 173 + 105) & 0xFF;
    }

    int offsets[2 /* col offset */][2 /* row offset */];

    // walk the row one BLOCK_SIZE-wide column at a time
    for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
        const int overlap_x = data->overlap_flag && bx;

        for (int r = 0; r < num_seeds; r++) {
            // the offset used for the previous column becomes the
            // left-neighbour offset before a fresh one is drawn
            if (overlap_x)
                offsets[1][r] = offsets[0][r];
            offsets[0][r] = get_random_number(8, &row_seed[r]);
        }

        // 1 = overlap y, 2 = overlap x
        const int type = ((data->overlap_flag && row_num) ? 1 : 0) |
                         (overlap_x ? 2 : 0);

        BF(dav1d_fgy_32x32, neon)(dst_row + bx, src_row + bx, stride,
                                  scaling, data->scaling_shift,
                                  grain_lut, offsets, bh,
                                  data->clip_to_restricted_range, type
                                  HIGHBD_TAIL_SUFFIX);
    }
}
152 
153 #define fguv_ss_fn(nm, sx, sy) \
154 static void \
155 fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
156                   const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \
157                   const int pw, const uint8_t scaling[SCALING_SIZE], \
158                   const entry grain_lut[][GRAIN_WIDTH], const int bh, \
159                   const int row_num, const pixel *const luma_row, \
160                   const ptrdiff_t luma_stride, const int uv, const int is_id \
161                   HIGHBD_DECL_SUFFIX) \
162 { \
163     const int rows = 1 + (data->overlap_flag && row_num > 0); \
164  \
165     /* seed[0] contains the current row, seed[1] contains the previous */ \
166     unsigned seed[2]; \
167     for (int i = 0; i < rows; i++) { \
168         seed[i] = data->seed; \
169         seed[i] ^= (((row_num - i) * 37  + 178) & 0xFF) << 8; \
170         seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); \
171     } \
172  \
173     int offsets[2 /* col offset */][2 /* row offset */]; \
174  \
175     /* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \
176     for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \
177         if (data->overlap_flag && bx) { \
178             /* shift previous offsets left */ \
179             for (int i = 0; i < rows; i++) \
180                 offsets[1][i] = offsets[0][i]; \
181         } \
182  \
183         /* update current offsets */ \
184         for (int i = 0; i < rows; i++) \
185             offsets[0][i] = get_random_number(8, &seed[i]); \
186  \
187         int type = 0; \
188         if (data->overlap_flag && row_num) \
189             type |= 1; /* overlap y */ \
190         if (data->overlap_flag && bx) \
191             type |= 2; /* overlap x */ \
192         if (data->chroma_scaling_from_luma) \
193             type |= 4; \
194  \
195         BF(dav1d_fguv_32x32_##nm, neon)(dst_row + bx, src_row + bx, stride, \
196                                         scaling, data, grain_lut, \
197                                         luma_row + (bx << sx), luma_stride, \
198                                         offsets, bh, uv, is_id, type \
199                                         HIGHBD_TAIL_SUFFIX); \
200     } \
201 }
202 
203 fguv_ss_fn(420, 1, 1);
204 fguv_ss_fn(422, 1, 0);
205 fguv_ss_fn(444, 0, 0);
206 
bitfn(dav1d_film_grain_dsp_init_arm)207 COLD void bitfn(dav1d_film_grain_dsp_init_arm)(Dav1dFilmGrainDSPContext *const c) {
208     const unsigned flags = dav1d_get_cpu_flags();
209 
210     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
211 
212 #if ARCH_AARCH64 && BITDEPTH == 8
213     c->generate_grain_y = BF(dav1d_generate_grain_y, neon);
214     c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon);
215     c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon);
216     c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, neon);
217 #endif
218 
219     c->fgy_32x32xn = fgy_32x32xn_neon;
220     c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon;
221     c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon;
222     c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon;
223 }
224