1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/obmc.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_ENABLE_NEON
19 
20 #include <arm_neon.h>
21 
22 #include <algorithm>
23 #include <cassert>
24 #include <cstddef>
25 #include <cstdint>
26 #include <cstring>
27 
28 #include "src/dsp/arm/common_neon.h"
29 #include "src/dsp/constants.h"
30 #include "src/dsp/dsp.h"
31 #include "src/utils/common.h"
32 
33 namespace libgav1 {
34 namespace dsp {
35 namespace {
36 #include "src/dsp/obmc.inc"
37 
38 }  // namespace
39 
40 namespace low_bitdepth {
41 namespace {
42 
WriteObmcLine4(uint8_t * LIBGAV1_RESTRICT const pred,const uint8_t * LIBGAV1_RESTRICT const obmc_pred,const uint8x8_t pred_mask,const uint8x8_t obmc_pred_mask)43 inline void WriteObmcLine4(uint8_t* LIBGAV1_RESTRICT const pred,
44                            const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
45                            const uint8x8_t pred_mask,
46                            const uint8x8_t obmc_pred_mask) {
47   const uint8x8_t pred_val = Load4(pred);
48   const uint8x8_t obmc_pred_val = Load4(obmc_pred);
49   const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
50   const uint8x8_t result =
51       vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
52   StoreLo4(pred, result);
53 }
54 
OverlapBlendFromLeft2xH_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride)55 inline void OverlapBlendFromLeft2xH_NEON(
56     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
57     const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
58     const ptrdiff_t obmc_prediction_stride) {
59   const uint8x8_t mask_inverter = vdup_n_u8(64);
60   const uint8x8_t pred_mask = Load2(kObmcMask);
61   const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
62   uint8x8_t pred_val = vdup_n_u8(0);
63   uint8x8_t obmc_pred_val = vdup_n_u8(0);
64   int y = 0;
65   do {
66     pred_val = Load2<0>(pred, pred_val);
67     const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
68     obmc_pred_val = Load2<0>(obmc_pred, obmc_pred_val);
69     const uint8x8_t result =
70         vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
71     Store2<0>(pred, result);
72 
73     pred += prediction_stride;
74     obmc_pred += obmc_prediction_stride;
75   } while (++y != height);
76 }
77 
OverlapBlendFromLeft4xH_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride)78 inline void OverlapBlendFromLeft4xH_NEON(
79     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
80     const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
81     const ptrdiff_t obmc_prediction_stride) {
82   const uint8x8_t mask_inverter = vdup_n_u8(64);
83   const uint8x8_t pred_mask = Load4(kObmcMask + 2);
84   // 64 - mask
85   const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
86   int y = 0;
87   do {
88     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
89     pred += prediction_stride;
90     obmc_pred += obmc_prediction_stride;
91 
92     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
93     pred += prediction_stride;
94     obmc_pred += obmc_prediction_stride;
95 
96     y += 2;
97   } while (y != height);
98 }
99 
OverlapBlendFromLeft8xH_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride)100 inline void OverlapBlendFromLeft8xH_NEON(
101     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
102     const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
103     const ptrdiff_t obmc_prediction_stride) {
104   const uint8x8_t mask_inverter = vdup_n_u8(64);
105   const uint8x8_t pred_mask = vld1_u8(kObmcMask + 6);
106   // 64 - mask
107   const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
108   int y = 0;
109   do {
110     const uint8x8_t pred_val = vld1_u8(pred);
111     const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
112     const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
113     const uint8x8_t result =
114         vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
115 
116     vst1_u8(pred, result);
117     pred += prediction_stride;
118     obmc_pred += obmc_prediction_stride;
119   } while (++y != height);
120 }
121 
OverlapBlendFromLeft_NEON(void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * LIBGAV1_RESTRICT const obmc_prediction,const ptrdiff_t obmc_prediction_stride)122 void OverlapBlendFromLeft_NEON(
123     void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
124     const int width, const int height,
125     const void* LIBGAV1_RESTRICT const obmc_prediction,
126     const ptrdiff_t obmc_prediction_stride) {
127   auto* pred = static_cast<uint8_t*>(prediction);
128   const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
129   assert(width >= 2);
130   assert(height >= 4);
131 
132   if (width == 2) {
133     OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred,
134                                  obmc_prediction_stride);
135     return;
136   }
137   if (width == 4) {
138     OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred,
139                                  obmc_prediction_stride);
140     return;
141   }
142   if (width == 8) {
143     OverlapBlendFromLeft8xH_NEON(pred, prediction_stride, height, obmc_pred,
144                                  obmc_prediction_stride);
145     return;
146   }
147   const uint8x16_t mask_inverter = vdupq_n_u8(64);
148   const uint8_t* mask = kObmcMask + width - 2;
149   int x = 0;
150   do {
151     pred = static_cast<uint8_t*>(prediction) + x;
152     obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
153     const uint8x16_t pred_mask = vld1q_u8(mask + x);
154     // 64 - mask
155     const uint8x16_t obmc_pred_mask = vsubq_u8(mask_inverter, pred_mask);
156     int y = 0;
157     do {
158       const uint8x16_t pred_val = vld1q_u8(pred);
159       const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred);
160       const uint16x8_t weighted_pred_lo =
161           vmull_u8(vget_low_u8(pred_mask), vget_low_u8(pred_val));
162       const uint8x8_t result_lo =
163           vrshrn_n_u16(vmlal_u8(weighted_pred_lo, vget_low_u8(obmc_pred_mask),
164                                 vget_low_u8(obmc_pred_val)),
165                        6);
166       const uint16x8_t weighted_pred_hi =
167           vmull_u8(vget_high_u8(pred_mask), vget_high_u8(pred_val));
168       const uint8x8_t result_hi =
169           vrshrn_n_u16(vmlal_u8(weighted_pred_hi, vget_high_u8(obmc_pred_mask),
170                                 vget_high_u8(obmc_pred_val)),
171                        6);
172       vst1q_u8(pred, vcombine_u8(result_lo, result_hi));
173 
174       pred += prediction_stride;
175       obmc_pred += obmc_prediction_stride;
176     } while (++y < height);
177     x += 16;
178   } while (x < width);
179 }
180 
OverlapBlendFromTop4x4_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride,const int height)181 inline void OverlapBlendFromTop4x4_NEON(
182     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
183     const uint8_t* LIBGAV1_RESTRICT obmc_pred,
184     const ptrdiff_t obmc_prediction_stride, const int height) {
185   uint8x8_t pred_mask = vdup_n_u8(kObmcMask[height - 2]);
186   const uint8x8_t mask_inverter = vdup_n_u8(64);
187   uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
188   WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
189   pred += prediction_stride;
190   obmc_pred += obmc_prediction_stride;
191 
192   if (height == 2) {
193     return;
194   }
195 
196   pred_mask = vdup_n_u8(kObmcMask[3]);
197   obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
198   WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
199   pred += prediction_stride;
200   obmc_pred += obmc_prediction_stride;
201 
202   pred_mask = vdup_n_u8(kObmcMask[4]);
203   obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
204   WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
205 }
206 
OverlapBlendFromTop4xH_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride)207 inline void OverlapBlendFromTop4xH_NEON(
208     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
209     const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
210     const ptrdiff_t obmc_prediction_stride) {
211   if (height < 8) {
212     OverlapBlendFromTop4x4_NEON(pred, prediction_stride, obmc_pred,
213                                 obmc_prediction_stride, height);
214     return;
215   }
216   const uint8_t* mask = kObmcMask + height - 2;
217   const uint8x8_t mask_inverter = vdup_n_u8(64);
218   int y = 0;
219   // Compute 6 lines for height 8, or 12 lines for height 16. The remaining
220   // lines are unchanged as the corresponding mask value is 64.
221   do {
222     uint8x8_t pred_mask = vdup_n_u8(mask[y]);
223     uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
224     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
225     pred += prediction_stride;
226     obmc_pred += obmc_prediction_stride;
227 
228     pred_mask = vdup_n_u8(mask[y + 1]);
229     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
230     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
231     pred += prediction_stride;
232     obmc_pred += obmc_prediction_stride;
233 
234     pred_mask = vdup_n_u8(mask[y + 2]);
235     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
236     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
237     pred += prediction_stride;
238     obmc_pred += obmc_prediction_stride;
239 
240     pred_mask = vdup_n_u8(mask[y + 3]);
241     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
242     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
243     pred += prediction_stride;
244     obmc_pred += obmc_prediction_stride;
245 
246     pred_mask = vdup_n_u8(mask[y + 4]);
247     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
248     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
249     pred += prediction_stride;
250     obmc_pred += obmc_prediction_stride;
251 
252     pred_mask = vdup_n_u8(mask[y + 5]);
253     obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
254     WriteObmcLine4(pred, obmc_pred, pred_mask, obmc_pred_mask);
255     pred += prediction_stride;
256     obmc_pred += obmc_prediction_stride;
257 
258     // Increment for the right mask index.
259     y += 6;
260   } while (y < height - 4);
261 }
262 
OverlapBlendFromTop8xH_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride)263 inline void OverlapBlendFromTop8xH_NEON(
264     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
265     const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
266     const ptrdiff_t obmc_prediction_stride) {
267   const uint8x8_t mask_inverter = vdup_n_u8(64);
268   const uint8_t* mask = kObmcMask + height - 2;
269   const int compute_height = height - (height >> 2);
270   int y = 0;
271   do {
272     const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
273     // 64 - mask
274     const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
275     const uint8x8_t pred_val = vld1_u8(pred);
276     const uint16x8_t weighted_pred = vmull_u8(pred_mask, pred_val);
277     const uint8x8_t obmc_pred_val = vld1_u8(obmc_pred);
278     const uint8x8_t result =
279         vrshrn_n_u16(vmlal_u8(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
280 
281     vst1_u8(pred, result);
282     pred += prediction_stride;
283     obmc_pred += obmc_prediction_stride;
284   } while (++y != compute_height);
285 }
286 
OverlapBlendFromTop_NEON(void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * LIBGAV1_RESTRICT const obmc_prediction,const ptrdiff_t obmc_prediction_stride)287 void OverlapBlendFromTop_NEON(
288     void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
289     const int width, const int height,
290     const void* LIBGAV1_RESTRICT const obmc_prediction,
291     const ptrdiff_t obmc_prediction_stride) {
292   auto* pred = static_cast<uint8_t*>(prediction);
293   const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
294   assert(width >= 4);
295   assert(height >= 2);
296 
297   if (width == 4) {
298     OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
299                                 obmc_prediction_stride);
300     return;
301   }
302 
303   if (width == 8) {
304     OverlapBlendFromTop8xH_NEON(pred, prediction_stride, height, obmc_pred,
305                                 obmc_prediction_stride);
306     return;
307   }
308 
309   const uint8_t* mask = kObmcMask + height - 2;
310   const uint8x8_t mask_inverter = vdup_n_u8(64);
311   // Stop when mask value becomes 64. This is inferred for 4xH.
312   const int compute_height = height - (height >> 2);
313   int y = 0;
314   do {
315     const uint8x8_t pred_mask = vdup_n_u8(mask[y]);
316     // 64 - mask
317     const uint8x8_t obmc_pred_mask = vsub_u8(mask_inverter, pred_mask);
318     int x = 0;
319     do {
320       const uint8x16_t pred_val = vld1q_u8(pred + x);
321       const uint8x16_t obmc_pred_val = vld1q_u8(obmc_pred + x);
322       const uint16x8_t weighted_pred_lo =
323           vmull_u8(pred_mask, vget_low_u8(pred_val));
324       const uint8x8_t result_lo =
325           vrshrn_n_u16(vmlal_u8(weighted_pred_lo, obmc_pred_mask,
326                                 vget_low_u8(obmc_pred_val)),
327                        6);
328       const uint16x8_t weighted_pred_hi =
329           vmull_u8(pred_mask, vget_high_u8(pred_val));
330       const uint8x8_t result_hi =
331           vrshrn_n_u16(vmlal_u8(weighted_pred_hi, obmc_pred_mask,
332                                 vget_high_u8(obmc_pred_val)),
333                        6);
334       vst1q_u8(pred + x, vcombine_u8(result_lo, result_hi));
335 
336       x += 16;
337     } while (x < width);
338     pred += prediction_stride;
339     obmc_pred += obmc_prediction_stride;
340   } while (++y < compute_height);
341 }
342 
Init8bpp()343 void Init8bpp() {
344   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
345   assert(dsp != nullptr);
346   dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON;
347   dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON;
348 }
349 
350 }  // namespace
351 }  // namespace low_bitdepth
352 
353 #if LIBGAV1_MAX_BITDEPTH >= 10
354 namespace high_bitdepth {
355 namespace {
356 
// Flat table holding the blend masks for block dimensions 2, 4, 8, 16 and 32,
// stored back to back. The mask for a given length begins at index
// (length - 2). An entry of 64 leaves the result equal to |pred|, so such
// entries may be skipped when convenient. Vector loads may overread into the
// values belonging to larger sizes; those extra lanes are never used.
constexpr uint16_t kObmcMask[62] = {
    // Length 2, starting at index 0.
    45, 64,
    // Length 4, starting at index 2.
    39, 50, 59, 64,
    // Length 8, starting at index 6.
    36, 42, 48, 53, 57, 61, 64, 64,
    // Length 16, starting at index 14.
    34, 37, 40, 43, 46, 49, 52, 54,
    56, 58, 60, 61, 64, 64, 64, 64,
    // Length 32, starting at index 30.
    33, 35, 36, 38, 40, 41, 43, 44,
    45, 47, 48, 50, 51, 52, 53, 55,
    56, 57, 58, 59, 60, 60, 61, 62,
    64, 64, 64, 64, 64, 64, 64, 64};
373 
BlendObmc2Or4(uint8_t * LIBGAV1_RESTRICT const pred,const uint8_t * LIBGAV1_RESTRICT const obmc_pred,const uint16x4_t pred_mask,const uint16x4_t obmc_pred_mask)374 inline uint16x4_t BlendObmc2Or4(uint8_t* LIBGAV1_RESTRICT const pred,
375                                 const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
376                                 const uint16x4_t pred_mask,
377                                 const uint16x4_t obmc_pred_mask) {
378   const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred));
379   const uint16x4_t obmc_pred_val =
380       vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
381   const uint16x4_t weighted_pred = vmul_u16(pred_mask, pred_val);
382   const uint16x4_t result =
383       vrshr_n_u16(vmla_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
384   return result;
385 }
386 
BlendObmc8(uint8_t * LIBGAV1_RESTRICT const pred,const uint8_t * LIBGAV1_RESTRICT const obmc_pred,const uint16x8_t pred_mask,const uint16x8_t obmc_pred_mask)387 inline uint16x8_t BlendObmc8(uint8_t* LIBGAV1_RESTRICT const pred,
388                              const uint8_t* LIBGAV1_RESTRICT const obmc_pred,
389                              const uint16x8_t pred_mask,
390                              const uint16x8_t obmc_pred_mask) {
391   const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred));
392   const uint16x8_t obmc_pred_val =
393       vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
394   const uint16x8_t weighted_pred = vmulq_u16(pred_mask, pred_val);
395   const uint16x8_t result =
396       vrshrq_n_u16(vmlaq_u16(weighted_pred, obmc_pred_mask, obmc_pred_val), 6);
397   return result;
398 }
399 
OverlapBlendFromLeft2xH_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride)400 inline void OverlapBlendFromLeft2xH_NEON(
401     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
402     const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
403     const ptrdiff_t obmc_prediction_stride) {
404   const uint16x4_t mask_inverter = vdup_n_u16(64);
405   // Second two lanes unused.
406   const uint16x4_t pred_mask = vld1_u16(kObmcMask);
407   const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
408   int y = 0;
409   do {
410     const uint16x4_t result_0 =
411         BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
412     Store2<0>(reinterpret_cast<uint16_t*>(pred), result_0);
413 
414     pred += prediction_stride;
415     obmc_pred += obmc_prediction_stride;
416 
417     const uint16x4_t result_1 =
418         BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
419     Store2<0>(reinterpret_cast<uint16_t*>(pred), result_1);
420 
421     pred += prediction_stride;
422     obmc_pred += obmc_prediction_stride;
423 
424     y += 2;
425   } while (y != height);
426 }
427 
OverlapBlendFromLeft4xH_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride)428 inline void OverlapBlendFromLeft4xH_NEON(
429     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
430     const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
431     const ptrdiff_t obmc_prediction_stride) {
432   const uint16x4_t mask_inverter = vdup_n_u16(64);
433   const uint16x4_t pred_mask = vld1_u16(kObmcMask + 2);
434   // 64 - mask
435   const uint16x4_t obmc_pred_mask = vsub_u16(mask_inverter, pred_mask);
436   int y = 0;
437   do {
438     const uint16x4_t result_0 =
439         BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
440     vst1_u16(reinterpret_cast<uint16_t*>(pred), result_0);
441     pred += prediction_stride;
442     obmc_pred += obmc_prediction_stride;
443 
444     const uint16x4_t result_1 =
445         BlendObmc2Or4(pred, obmc_pred, pred_mask, obmc_pred_mask);
446     vst1_u16(reinterpret_cast<uint16_t*>(pred), result_1);
447     pred += prediction_stride;
448     obmc_pred += obmc_prediction_stride;
449 
450     y += 2;
451   } while (y != height);
452 }
453 
OverlapBlendFromLeft_NEON(void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * LIBGAV1_RESTRICT const obmc_prediction,const ptrdiff_t obmc_prediction_stride)454 void OverlapBlendFromLeft_NEON(
455     void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
456     const int width, const int height,
457     const void* LIBGAV1_RESTRICT const obmc_prediction,
458     const ptrdiff_t obmc_prediction_stride) {
459   auto* pred = static_cast<uint8_t*>(prediction);
460   const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
461   assert(width >= 2);
462   assert(height >= 4);
463 
464   if (width == 2) {
465     OverlapBlendFromLeft2xH_NEON(pred, prediction_stride, height, obmc_pred,
466                                  obmc_prediction_stride);
467     return;
468   }
469   if (width == 4) {
470     OverlapBlendFromLeft4xH_NEON(pred, prediction_stride, height, obmc_pred,
471                                  obmc_prediction_stride);
472     return;
473   }
474   const uint16x8_t mask_inverter = vdupq_n_u16(64);
475   const uint16_t* mask = kObmcMask + width - 2;
476   int x = 0;
477   do {
478     pred = reinterpret_cast<uint8_t*>(static_cast<uint16_t*>(prediction) + x);
479     obmc_pred = reinterpret_cast<const uint8_t*>(
480         static_cast<const uint16_t*>(obmc_prediction) + x);
481     const uint16x8_t pred_mask = vld1q_u16(mask + x);
482     // 64 - mask
483     const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
484     int y = 0;
485     do {
486       const uint16x8_t result =
487           BlendObmc8(pred, obmc_pred, pred_mask, obmc_pred_mask);
488       vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
489 
490       pred += prediction_stride;
491       obmc_pred += obmc_prediction_stride;
492     } while (++y < height);
493     x += 8;
494   } while (x < width);
495 }
496 
497 template <int lane>
BlendObmcFromTop4(uint8_t * LIBGAV1_RESTRICT const pred,const uint8_t * LIBGAV1_RESTRICT const obmc_pred,const uint16x8_t pred_mask,const uint16x8_t obmc_pred_mask)498 inline uint16x4_t BlendObmcFromTop4(
499     uint8_t* LIBGAV1_RESTRICT const pred,
500     const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask,
501     const uint16x8_t obmc_pred_mask) {
502   const uint16x4_t pred_val = vld1_u16(reinterpret_cast<uint16_t*>(pred));
503   const uint16x4_t obmc_pred_val =
504       vld1_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
505   const uint16x4_t weighted_pred = VMulLaneQU16<lane>(pred_val, pred_mask);
506   const uint16x4_t result = vrshr_n_u16(
507       VMlaLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
508   return result;
509 }
510 
511 template <int lane>
BlendObmcFromTop8(uint8_t * LIBGAV1_RESTRICT const pred,const uint8_t * LIBGAV1_RESTRICT const obmc_pred,const uint16x8_t pred_mask,const uint16x8_t obmc_pred_mask)512 inline uint16x8_t BlendObmcFromTop8(
513     uint8_t* LIBGAV1_RESTRICT const pred,
514     const uint8_t* LIBGAV1_RESTRICT const obmc_pred, const uint16x8_t pred_mask,
515     const uint16x8_t obmc_pred_mask) {
516   const uint16x8_t pred_val = vld1q_u16(reinterpret_cast<uint16_t*>(pred));
517   const uint16x8_t obmc_pred_val =
518       vld1q_u16(reinterpret_cast<const uint16_t*>(obmc_pred));
519   const uint16x8_t weighted_pred = VMulQLaneQU16<lane>(pred_val, pred_mask);
520   const uint16x8_t result = vrshrq_n_u16(
521       VMlaQLaneQU16<lane>(weighted_pred, obmc_pred_val, obmc_pred_mask), 6);
522   return result;
523 }
524 
OverlapBlendFromTop4x2Or4_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride,const int height)525 inline void OverlapBlendFromTop4x2Or4_NEON(
526     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
527     const uint8_t* LIBGAV1_RESTRICT obmc_pred,
528     const ptrdiff_t obmc_prediction_stride, const int height) {
529   const uint16x8_t pred_mask = vld1q_u16(&kObmcMask[height - 2]);
530   const uint16x8_t mask_inverter = vdupq_n_u16(64);
531   const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
532   uint16x4_t result =
533       BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
534   vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
535   pred += prediction_stride;
536   obmc_pred += obmc_prediction_stride;
537 
538   if (height == 2) {
539     // Mask value is 64, meaning |pred| is unchanged.
540     return;
541   }
542 
543   result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
544   vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
545   pred += prediction_stride;
546   obmc_pred += obmc_prediction_stride;
547 
548   result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
549   vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
550 }
551 
OverlapBlendFromTop4xH_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride)552 inline void OverlapBlendFromTop4xH_NEON(
553     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
554     const int height, const uint8_t* LIBGAV1_RESTRICT obmc_pred,
555     const ptrdiff_t obmc_prediction_stride) {
556   if (height < 8) {
557     OverlapBlendFromTop4x2Or4_NEON(pred, prediction_stride, obmc_pred,
558                                    obmc_prediction_stride, height);
559     return;
560   }
561   const uint16_t* mask = kObmcMask + height - 2;
562   const uint16x8_t mask_inverter = vdupq_n_u16(64);
563   int y = 0;
564   // Compute 6 lines for height 8, or 12 lines for height 16. The remaining
565   // lines are unchanged as the corresponding mask value is 64.
566   do {
567     const uint16x8_t pred_mask = vld1q_u16(&mask[y]);
568     const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
569     uint16x4_t result =
570         BlendObmcFromTop4<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
571     vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
572     pred += prediction_stride;
573     obmc_pred += obmc_prediction_stride;
574 
575     result = BlendObmcFromTop4<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
576     vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
577     pred += prediction_stride;
578     obmc_pred += obmc_prediction_stride;
579 
580     result = BlendObmcFromTop4<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
581     vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
582     pred += prediction_stride;
583     obmc_pred += obmc_prediction_stride;
584 
585     result = BlendObmcFromTop4<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
586     vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
587     pred += prediction_stride;
588     obmc_pred += obmc_prediction_stride;
589 
590     result = BlendObmcFromTop4<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
591     vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
592     pred += prediction_stride;
593     obmc_pred += obmc_prediction_stride;
594 
595     result = BlendObmcFromTop4<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
596     vst1_u16(reinterpret_cast<uint16_t*>(pred), result);
597     pred += prediction_stride;
598     obmc_pred += obmc_prediction_stride;
599 
600     // Increment for the right mask index.
601     y += 6;
602   } while (y < height - 4);
603 }
604 
OverlapBlendFromTop8xH_NEON(uint8_t * LIBGAV1_RESTRICT pred,const ptrdiff_t prediction_stride,const uint8_t * LIBGAV1_RESTRICT obmc_pred,const ptrdiff_t obmc_prediction_stride,const int height)605 inline void OverlapBlendFromTop8xH_NEON(
606     uint8_t* LIBGAV1_RESTRICT pred, const ptrdiff_t prediction_stride,
607     const uint8_t* LIBGAV1_RESTRICT obmc_pred,
608     const ptrdiff_t obmc_prediction_stride, const int height) {
609   const uint16_t* mask = kObmcMask + height - 2;
610   const uint16x8_t mask_inverter = vdupq_n_u16(64);
611   uint16x8_t pred_mask = vld1q_u16(mask);
612   uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
613   uint16x8_t result =
614       BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
615   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
616   if (height == 2) return;
617 
618   pred += prediction_stride;
619   obmc_pred += obmc_prediction_stride;
620 
621   result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
622   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
623   pred += prediction_stride;
624   obmc_pred += obmc_prediction_stride;
625 
626   result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
627   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
628   pred += prediction_stride;
629   obmc_pred += obmc_prediction_stride;
630 
631   result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
632   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
633   if (height == 4) return;
634 
635   pred += prediction_stride;
636   obmc_pred += obmc_prediction_stride;
637 
638   result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
639   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
640   pred += prediction_stride;
641   obmc_pred += obmc_prediction_stride;
642 
643   result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
644   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
645 
646   if (height == 8) return;
647 
648   pred += prediction_stride;
649   obmc_pred += obmc_prediction_stride;
650 
651   result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
652   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
653   pred += prediction_stride;
654   obmc_pred += obmc_prediction_stride;
655 
656   result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
657   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
658   pred += prediction_stride;
659   obmc_pred += obmc_prediction_stride;
660 
661   pred_mask = vld1q_u16(&mask[8]);
662   obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
663 
664   result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
665   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
666   pred += prediction_stride;
667   obmc_pred += obmc_prediction_stride;
668 
669   result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
670   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
671   pred += prediction_stride;
672   obmc_pred += obmc_prediction_stride;
673 
674   result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
675   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
676   pred += prediction_stride;
677   obmc_pred += obmc_prediction_stride;
678 
679   result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
680   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
681 
682   if (height == 16) return;
683 
684   pred += prediction_stride;
685   obmc_pred += obmc_prediction_stride;
686 
687   result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
688   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
689   pred += prediction_stride;
690   obmc_pred += obmc_prediction_stride;
691 
692   result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
693   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
694   pred += prediction_stride;
695   obmc_pred += obmc_prediction_stride;
696 
697   result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
698   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
699   pred += prediction_stride;
700   obmc_pred += obmc_prediction_stride;
701 
702   result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
703   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
704   pred += prediction_stride;
705   obmc_pred += obmc_prediction_stride;
706 
707   pred_mask = vld1q_u16(&mask[16]);
708   obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
709 
710   result = BlendObmcFromTop8<0>(pred, obmc_pred, pred_mask, obmc_pred_mask);
711   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
712   pred += prediction_stride;
713   obmc_pred += obmc_prediction_stride;
714 
715   result = BlendObmcFromTop8<1>(pred, obmc_pred, pred_mask, obmc_pred_mask);
716   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
717   pred += prediction_stride;
718   obmc_pred += obmc_prediction_stride;
719 
720   result = BlendObmcFromTop8<2>(pred, obmc_pred, pred_mask, obmc_pred_mask);
721   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
722   pred += prediction_stride;
723   obmc_pred += obmc_prediction_stride;
724 
725   result = BlendObmcFromTop8<3>(pred, obmc_pred, pred_mask, obmc_pred_mask);
726   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
727   pred += prediction_stride;
728   obmc_pred += obmc_prediction_stride;
729 
730   result = BlendObmcFromTop8<4>(pred, obmc_pred, pred_mask, obmc_pred_mask);
731   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
732   pred += prediction_stride;
733   obmc_pred += obmc_prediction_stride;
734 
735   result = BlendObmcFromTop8<5>(pred, obmc_pred, pred_mask, obmc_pred_mask);
736   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
737   pred += prediction_stride;
738   obmc_pred += obmc_prediction_stride;
739 
740   result = BlendObmcFromTop8<6>(pred, obmc_pred, pred_mask, obmc_pred_mask);
741   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
742   pred += prediction_stride;
743   obmc_pred += obmc_prediction_stride;
744 
745   result = BlendObmcFromTop8<7>(pred, obmc_pred, pred_mask, obmc_pred_mask);
746   vst1q_u16(reinterpret_cast<uint16_t*>(pred), result);
747 }
748 
OverlapBlendFromTop_NEON(void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * LIBGAV1_RESTRICT const obmc_prediction,const ptrdiff_t obmc_prediction_stride)749 void OverlapBlendFromTop_NEON(
750     void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
751     const int width, const int height,
752     const void* LIBGAV1_RESTRICT const obmc_prediction,
753     const ptrdiff_t obmc_prediction_stride) {
754   auto* pred = static_cast<uint8_t*>(prediction);
755   const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
756   assert(width >= 4);
757   assert(height >= 2);
758 
759   if (width == 4) {
760     OverlapBlendFromTop4xH_NEON(pred, prediction_stride, height, obmc_pred,
761                                 obmc_prediction_stride);
762     return;
763   }
764 
765   if (width == 8) {
766     OverlapBlendFromTop8xH_NEON(pred, prediction_stride, obmc_pred,
767                                 obmc_prediction_stride, height);
768     return;
769   }
770 
771   const uint16_t* mask = kObmcMask + height - 2;
772   const uint16x8_t mask_inverter = vdupq_n_u16(64);
773   const uint16x8_t pred_mask = vld1q_u16(mask);
774   // 64 - mask
775   const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
776 #define OBMC_ROW_FROM_TOP(n)                                                 \
777   do {                                                                       \
778     int x = 0;                                                               \
779     do {                                                                     \
780       const uint16x8_t result = BlendObmcFromTop8<n>(                        \
781           reinterpret_cast<uint8_t*>(reinterpret_cast<uint16_t*>(pred) + x), \
782           reinterpret_cast<const uint8_t*>(                                  \
783               reinterpret_cast<const uint16_t*>(obmc_pred) + x),             \
784           pred_mask, obmc_pred_mask);                                        \
785       vst1q_u16(reinterpret_cast<uint16_t*>(pred) + x, result);              \
786                                                                              \
787       x += 8;                                                                \
788     } while (x < width);                                                     \
789   } while (false)
790 
791   // Compute 1 row.
792   if (height == 2) {
793     OBMC_ROW_FROM_TOP(0);
794     return;
795   }
796 
797   // Compute 3 rows.
798   if (height == 4) {
799     OBMC_ROW_FROM_TOP(0);
800     pred += prediction_stride;
801     obmc_pred += obmc_prediction_stride;
802     OBMC_ROW_FROM_TOP(1);
803     pred += prediction_stride;
804     obmc_pred += obmc_prediction_stride;
805     OBMC_ROW_FROM_TOP(2);
806     return;
807   }
808 
809   // Compute 6 rows.
810   if (height == 8) {
811     OBMC_ROW_FROM_TOP(0);
812     pred += prediction_stride;
813     obmc_pred += obmc_prediction_stride;
814     OBMC_ROW_FROM_TOP(1);
815     pred += prediction_stride;
816     obmc_pred += obmc_prediction_stride;
817     OBMC_ROW_FROM_TOP(2);
818     pred += prediction_stride;
819     obmc_pred += obmc_prediction_stride;
820     OBMC_ROW_FROM_TOP(3);
821     pred += prediction_stride;
822     obmc_pred += obmc_prediction_stride;
823     OBMC_ROW_FROM_TOP(4);
824     pred += prediction_stride;
825     obmc_pred += obmc_prediction_stride;
826     OBMC_ROW_FROM_TOP(5);
827     return;
828   }
829 
830   // Compute 12 rows.
831   if (height == 16) {
832     OBMC_ROW_FROM_TOP(0);
833     pred += prediction_stride;
834     obmc_pred += obmc_prediction_stride;
835     OBMC_ROW_FROM_TOP(1);
836     pred += prediction_stride;
837     obmc_pred += obmc_prediction_stride;
838     OBMC_ROW_FROM_TOP(2);
839     pred += prediction_stride;
840     obmc_pred += obmc_prediction_stride;
841     OBMC_ROW_FROM_TOP(3);
842     pred += prediction_stride;
843     obmc_pred += obmc_prediction_stride;
844     OBMC_ROW_FROM_TOP(4);
845     pred += prediction_stride;
846     obmc_pred += obmc_prediction_stride;
847     OBMC_ROW_FROM_TOP(5);
848     pred += prediction_stride;
849     obmc_pred += obmc_prediction_stride;
850     OBMC_ROW_FROM_TOP(6);
851     pred += prediction_stride;
852     obmc_pred += obmc_prediction_stride;
853     OBMC_ROW_FROM_TOP(7);
854     pred += prediction_stride;
855     obmc_pred += obmc_prediction_stride;
856 
857     const uint16x8_t pred_mask = vld1q_u16(&mask[8]);
858     // 64 - mask
859     const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
860     OBMC_ROW_FROM_TOP(0);
861     pred += prediction_stride;
862     obmc_pred += obmc_prediction_stride;
863     OBMC_ROW_FROM_TOP(1);
864     pred += prediction_stride;
865     obmc_pred += obmc_prediction_stride;
866     OBMC_ROW_FROM_TOP(2);
867     pred += prediction_stride;
868     obmc_pred += obmc_prediction_stride;
869     OBMC_ROW_FROM_TOP(3);
870     return;
871   }
872 
873   // Stop when mask value becomes 64. This is a multiple of 8 for height 32
874   // and 64.
875   const int compute_height = height - (height >> 2);
876   int y = 0;
877   do {
878     const uint16x8_t pred_mask = vld1q_u16(&mask[y]);
879     // 64 - mask
880     const uint16x8_t obmc_pred_mask = vsubq_u16(mask_inverter, pred_mask);
881     OBMC_ROW_FROM_TOP(0);
882     pred += prediction_stride;
883     obmc_pred += obmc_prediction_stride;
884     OBMC_ROW_FROM_TOP(1);
885     pred += prediction_stride;
886     obmc_pred += obmc_prediction_stride;
887     OBMC_ROW_FROM_TOP(2);
888     pred += prediction_stride;
889     obmc_pred += obmc_prediction_stride;
890     OBMC_ROW_FROM_TOP(3);
891     pred += prediction_stride;
892     obmc_pred += obmc_prediction_stride;
893     OBMC_ROW_FROM_TOP(4);
894     pred += prediction_stride;
895     obmc_pred += obmc_prediction_stride;
896     OBMC_ROW_FROM_TOP(5);
897     pred += prediction_stride;
898     obmc_pred += obmc_prediction_stride;
899     OBMC_ROW_FROM_TOP(6);
900     pred += prediction_stride;
901     obmc_pred += obmc_prediction_stride;
902     OBMC_ROW_FROM_TOP(7);
903     pred += prediction_stride;
904     obmc_pred += obmc_prediction_stride;
905 
906     y += 8;
907   } while (y < compute_height);
908 }
909 
Init10bpp()910 void Init10bpp() {
911   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
912   assert(dsp != nullptr);
913   dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_NEON;
914   dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_NEON;
915 }
916 
917 }  // namespace
918 }  // namespace high_bitdepth
919 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
920 
// Entry point for NEON OBMC initialization. Runs the 8bpp initializer
// unconditionally and the 10bpp initializer only when the build is configured
// with LIBGAV1_MAX_BITDEPTH >= 10.
void ObmcInit_NEON() {
  low_bitdepth::Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
  high_bitdepth::Init10bpp();
#endif
}
927 
928 }  // namespace dsp
929 }  // namespace libgav1
930 
931 #else   // !LIBGAV1_ENABLE_NEON
932 
933 namespace libgav1 {
934 namespace dsp {
935 
// Empty definition compiled when LIBGAV1_ENABLE_NEON is off, so the symbol
// still exists in builds without NEON support.
void ObmcInit_NEON() {}
937 
938 }  // namespace dsp
939 }  // namespace libgav1
940 #endif  // LIBGAV1_ENABLE_NEON
941