/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_ports/mem.h"

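// Load four rows of four 16-bit samples; p is the stride in elements.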
static INLINE void load_4x4(const int16_t *s, ptrdiff_t p, int16x4_t *s0,
                            int16x4_t *s1, int16x4_t *s2, int16x4_t *s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

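// Load four rows of eight 16-bit samples.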
static INLINE void load_8x4(const uint16_t *s, ptrdiff_t p, uint16x8_t *s0,
                            uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
}

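// Load eight rows of eight 16-bit samples.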
static INLINE void load_8x8(const int16_t *s, ptrdiff_t p, int16x8_t *s0,
                            int16x8_t *s1, int16x8_t *s2, int16x8_t *s3,
                            int16x8_t *s4, int16x8_t *s5, int16x8_t *s6,
                            int16x8_t *s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

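// Store eight rows of eight 16-bit samples.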
static INLINE void store_8x8(uint16_t *s, ptrdiff_t p, const uint16x8_t s0,
                             const uint16x8_t s1, const uint16x8_t s2,
                             const uint16x8_t s3, const uint16x8_t s4,
                             const uint16x8_t s5, const uint16x8_t s6,
                             const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += p;
  vst1q_u16(s, s1);
  s += p;
  vst1q_u16(s, s2);
  s += p;
  vst1q_u16(s, s3);
  s += p;
  vst1q_u16(s, s4);
  s += p;
  vst1q_u16(s, s5);
  s += p;
  vst1q_u16(s, s6);
  s += p;
  vst1q_u16(s, s7);
}

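// Apply the 8-tap filter to four 16-bit samples. The widened 32-bit sums are
// returned unrounded so the caller can round, narrow and clamp them.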
static INLINE int32x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                    const int16x4_t s2, const int16x4_t s3,
                                    const int16x4_t s4, const int16x4_t s5,
                                    const int16x4_t s6, const int16x4_t s7,
                                    const int16x8_t filters) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int32x4_t sum = vdupq_n_s32(0);

  sum = vmlal_lane_s16(sum, s0, filters_lo, 0);
  sum = vmlal_lane_s16(sum, s1, filters_lo, 1);
  sum = vmlal_lane_s16(sum, s2, filters_lo, 2);
  sum = vmlal_lane_s16(sum, s3, filters_lo, 3);
  sum = vmlal_lane_s16(sum, s4, filters_hi, 0);
  sum = vmlal_lane_s16(sum, s5, filters_hi, 1);
  sum = vmlal_lane_s16(sum, s6, filters_hi, 2);
  sum = vmlal_lane_s16(sum, s7, filters_hi, 3);
  return sum;
}

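// Apply the 8-tap filter to eight 16-bit samples, round and narrow the 32-bit
// sums by 7 bits with unsigned saturation, then clamp to the bitdepth maximum
// given in max.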
static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
                                     const int16x8_t s2, const int16x8_t s3,
                                     const int16x8_t s4, const int16x8_t s5,
                                     const int16x8_t s6, const int16x8_t s7,
                                     const int16x8_t filters,
                                     const uint16x8_t max) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int32x4_t sum0 = vdupq_n_s32(0);
  int32x4_t sum1 = vdupq_n_s32(0);
  uint16x8_t d;

  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filters_lo, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filters_hi, 0);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2);
  sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filters_lo, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filters_hi, 0);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2);
  sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3);
  d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7));
  d = vminq_u16(d, max);
  return d;
}

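// Horizontal 8-tap convolution. Source blocks are transposed so the filter
// taps run along vector lanes, filtered, then transposed back before storing.
// Fractional-step filtering (x_step_q4 != 16) falls back to the C version.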
void vpx_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
                                     uint8_t *dst8, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y,  // unused
                                     int y_step_q4,            // unused
                                     int w, int h, int bd) {
  if (x_step_q4 != 16) {
    vpx_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x,
                                 x_step_q4, filter_y, y_step_q4, w, h, bd);
  } else {
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
    const int16x8_t filters = vld1q_s16(filter_x);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    uint16x8_t t0, t1, t2, t3;

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3;

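    // h == 4: the four rows fit in 4-lane vectors; step across the width
    // producing four output columns per iteration.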
    if (h == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23;

      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u16_8x4(&t0, &t1, &t2, &t3);
      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      src += 7;

      do {
        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
        transpose_s16_4x4d(&s7, &s8, &s9, &s10);

        d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        d01 = vminq_u16(d01, max);
        d23 = vminq_u16(d23, max);
        transpose_u16_4x4q(&d01, &d23);

        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        src += 4;
        dst += 4;
        w -= 4;
      } while (w > 0);
    } else {
      int16x8_t t4, t5, t6, t7;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3;

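      // w == 4: filter eight rows of four pixels per iteration.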
      if (w == 4) {
        do {
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
                   &t4, &t5, &t6, &t7);
          src += 8 * src_stride;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);
          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);

          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          transpose_u16_8x4(&d0, &d1, &d2, &d3);
          vst1_u16(dst, vget_low_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d3));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d3));
          dst += dst_stride;
          h -= 8;
        } while (h > 0);
      } else {
        int width;
        const uint16_t *s;
        uint16_t *d;
        int16x8_t s11, s12, s13, s14;
        uint16x8_t d4, d5, d6, d7;

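        // General case: produce the output in 8x8 tiles, sliding the last
        // seven source columns into s0-s6 for the next iteration.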
        do {
          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          width = w;
          s = src + 7;
          d = dst;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);

          do {
            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
                     &s12, &s13, &s14);
            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);

            d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
            d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
            d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
            d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
            d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max);
            d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max);
            d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max);
            d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max);

            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

            s0 = s8;
            s1 = s9;
            s2 = s10;
            s3 = s11;
            s4 = s12;
            s5 = s13;
            s6 = s14;
            s += 8;
            d += 8;
            width -= 8;
          } while (width > 0);
          src += 8 * src_stride;
          dst += 8 * dst_stride;
          h -= 8;
        } while (h > 0);
      }
    }
  }
}

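// Like vpx_highbd_convolve8_horiz_neon(), but the filtered result is averaged
// with the pixels already in dst (vrhaddq_u16, a rounding halving add) before
// storing.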
void vpx_highbd_convolve8_avg_horiz_neon(const uint8_t *src8,
                                         ptrdiff_t src_stride, uint8_t *dst8,
                                         ptrdiff_t dst_stride,
                                         const int16_t *filter_x, int x_step_q4,
                                         const int16_t *filter_y,  // unused
                                         int y_step_q4,            // unused
                                         int w, int h, int bd) {
  if (x_step_q4 != 16) {
    vpx_highbd_convolve8_avg_horiz_c(src8, src_stride, dst8, dst_stride,
                                     filter_x, x_step_q4, filter_y, y_step_q4,
                                     w, h, bd);
  } else {
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
    const int16x8_t filters = vld1q_s16(filter_x);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    uint16x8_t t0, t1, t2, t3;

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3;

    if (h == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23, t01, t23;

      __builtin_prefetch(src + 0 * src_stride);
      __builtin_prefetch(src + 1 * src_stride);
      __builtin_prefetch(src + 2 * src_stride);
      __builtin_prefetch(src + 3 * src_stride);
      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
      transpose_u16_8x4(&t0, &t1, &t2, &t3);
      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
      __builtin_prefetch(dst + 0 * dst_stride);
      __builtin_prefetch(dst + 1 * dst_stride);
      __builtin_prefetch(dst + 2 * dst_stride);
      __builtin_prefetch(dst + 3 * dst_stride);
      src += 7;

      do {
        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
        transpose_s16_4x4d(&s7, &s8, &s9, &s10);

        d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        t01 = vminq_u16(t01, max);
        t23 = vminq_u16(t23, max);
        transpose_u16_4x4q(&t01, &t23);

        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                           vld1_u16(dst + 2 * dst_stride));
        d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
                           vld1_u16(dst + 3 * dst_stride));
        d01 = vrhaddq_u16(d01, t01);
        d23 = vrhaddq_u16(d23, t23);

        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        src += 4;
        dst += 4;
        w -= 4;
      } while (w > 0);
    } else {
      int16x8_t t4, t5, t6, t7;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;

      if (w == 4) {
        do {
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
                   &t4, &t5, &t6, &t7);
          src += 8 * src_stride;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);
          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);

          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
          transpose_u16_8x4(&t0, &t1, &t2, &t3);

          d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                            vld1_u16(dst + 4 * dst_stride));
          d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
                            vld1_u16(dst + 5 * dst_stride));
          d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
                            vld1_u16(dst + 6 * dst_stride));
          d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride),
                            vld1_u16(dst + 7 * dst_stride));
          d0 = vrhaddq_u16(d0, t0);
          d1 = vrhaddq_u16(d1, t1);
          d2 = vrhaddq_u16(d2, t2);
          d3 = vrhaddq_u16(d3, t3);

          vst1_u16(dst, vget_low_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_low_u16(d3));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d0));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d1));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d2));
          dst += dst_stride;
          vst1_u16(dst, vget_high_u16(d3));
          dst += dst_stride;
          h -= 8;
        } while (h > 0);
      } else {
        int width;
        const uint16_t *s;
        uint16_t *d;
        int16x8_t s11, s12, s13, s14;
        uint16x8_t d4, d5, d6, d7;

        do {
          __builtin_prefetch(src + 0 * src_stride);
          __builtin_prefetch(src + 1 * src_stride);
          __builtin_prefetch(src + 2 * src_stride);
          __builtin_prefetch(src + 3 * src_stride);
          __builtin_prefetch(src + 4 * src_stride);
          __builtin_prefetch(src + 5 * src_stride);
          __builtin_prefetch(src + 6 * src_stride);
          __builtin_prefetch(src + 7 * src_stride);
          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
                   &s5, &s6, &s7);
          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

          width = w;
          s = src + 7;
          d = dst;
          __builtin_prefetch(dst + 0 * dst_stride);
          __builtin_prefetch(dst + 1 * dst_stride);
          __builtin_prefetch(dst + 2 * dst_stride);
          __builtin_prefetch(dst + 3 * dst_stride);
          __builtin_prefetch(dst + 4 * dst_stride);
          __builtin_prefetch(dst + 5 * dst_stride);
          __builtin_prefetch(dst + 6 * dst_stride);
          __builtin_prefetch(dst + 7 * dst_stride);

          do {
            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
                     &s12, &s13, &s14);
            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);

            d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
            d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
            d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
            d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
            d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max);
            d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max);
            d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max);
            d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max);

            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);

            d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
            d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
            d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
            d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
            d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride));
            d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride));
            d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride));
            d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride));

            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

            s0 = s8;
            s1 = s9;
            s2 = s10;
            s3 = s11;
            s4 = s12;
            s5 = s13;
            s6 = s14;
            s += 8;
            d += 8;
            width -= 8;
          } while (width > 0);
          src += 8 * src_stride;
          dst += 8 * dst_stride;
          h -= 8;
        } while (h > 0);
      }
    }
  }
}

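// Vertical 8-tap convolution. The taps already run down the rows, so rows are
// loaded, filtered and stored directly with no transpose.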
void vpx_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
                                    uint8_t *dst8, ptrdiff_t dst_stride,
                                    const int16_t *filter_x,  // unused
                                    int x_step_q4,            // unused
                                    const int16_t *filter_y, int y_step_q4,
                                    int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x,
                                x_step_q4, filter_y, y_step_q4, w, h, bd);
  } else {
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
    const int16x8_t filters = vld1q_s16(filter_y);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3 * src_stride;

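    // w == 4: each row is one 4-lane vector; produce four rows per iteration,
    // sliding rows s4-s10 down to s0-s6 for the next iteration.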
    if (w == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23;

      s0 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s1 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s2 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s3 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s4 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s5 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s6 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;

      do {
        s7 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s8 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s9 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s10 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;

        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        d01 = vminq_u16(d01, max);
        d23 = vminq_u16(d23, max);
        vst1_u16(dst, vget_low_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_low_u16(d23));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d23));
        dst += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        h -= 4;
      } while (h > 0);
    } else {
      int height;
      const uint16_t *s;
      uint16_t *d;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3;

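      // General case: work down the image in columns eight pixels wide.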
      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        s = src;
        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        d = dst;
        height = h;

        do {
          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;

          __builtin_prefetch(d + 0 * dst_stride);
          __builtin_prefetch(d + 1 * dst_stride);
          __builtin_prefetch(d + 2 * dst_stride);
          __builtin_prefetch(d + 3 * dst_stride);
          __builtin_prefetch(s + 0 * src_stride);
          __builtin_prefetch(s + 1 * src_stride);
          __builtin_prefetch(s + 2 * src_stride);
          __builtin_prefetch(s + 3 * src_stride);
          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          vst1q_u16(d, d0);
          d += dst_stride;
          vst1q_u16(d, d1);
          d += dst_stride;
          vst1q_u16(d, d2);
          d += dst_stride;
          vst1q_u16(d, d3);
          d += dst_stride;

          s0 = s4;
          s1 = s5;
          s2 = s6;
          s3 = s7;
          s4 = s8;
          s5 = s9;
          s6 = s10;
          height -= 4;
        } while (height > 0);
        src += 8;
        dst += 8;
        w -= 8;
      } while (w > 0);
    }
  }
}

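// Like vpx_highbd_convolve8_vert_neon(), but the filtered result is averaged
// with the pixels already in dst before storing.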
void vpx_highbd_convolve8_avg_vert_neon(const uint8_t *src8,
                                        ptrdiff_t src_stride, uint8_t *dst8,
                                        ptrdiff_t dst_stride,
                                        const int16_t *filter_x,  // unused
                                        int x_step_q4,            // unused
                                        const int16_t *filter_y, int y_step_q4,
                                        int w, int h, int bd) {
  if (y_step_q4 != 16) {
    vpx_highbd_convolve8_avg_vert_c(src8, src_stride, dst8, dst_stride,
                                    filter_x, x_step_q4, filter_y, y_step_q4, w,
                                    h, bd);
  } else {
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
    const int16x8_t filters = vld1q_s16(filter_y);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    assert(!((intptr_t)dst & 3));
    assert(!(dst_stride & 3));

    src -= 3 * src_stride;

    if (w == 4) {
      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      int32x4_t d0, d1, d2, d3;
      uint16x8_t d01, d23, t01, t23;

      s0 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s1 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s2 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s3 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s4 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s5 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;
      s6 = vreinterpret_s16_u16(vld1_u16(src));
      src += src_stride;

      do {
        s7 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s8 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s9 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;
        s10 = vreinterpret_s16_u16(vld1_u16(src));
        src += src_stride;

        __builtin_prefetch(dst + 0 * dst_stride);
        __builtin_prefetch(dst + 1 * dst_stride);
        __builtin_prefetch(dst + 2 * dst_stride);
        __builtin_prefetch(dst + 3 * dst_stride);
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
        d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
        d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
        d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
        t01 = vminq_u16(t01, max);
        t23 = vminq_u16(t23, max);

        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
                           vld1_u16(dst + 1 * dst_stride));
        d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
                           vld1_u16(dst + 3 * dst_stride));
        d01 = vrhaddq_u16(d01, t01);
        d23 = vrhaddq_u16(d23, t23);

        vst1_u16(dst, vget_low_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d01));
        dst += dst_stride;
        vst1_u16(dst, vget_low_u16(d23));
        dst += dst_stride;
        vst1_u16(dst, vget_high_u16(d23));
        dst += dst_stride;

        s0 = s4;
        s1 = s5;
        s2 = s6;
        s3 = s7;
        s4 = s8;
        s5 = s9;
        s6 = s10;
        h -= 4;
      } while (h > 0);
    } else {
      int height;
      const uint16_t *s;
      uint16_t *d;
      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;

      do {
        __builtin_prefetch(src + 0 * src_stride);
        __builtin_prefetch(src + 1 * src_stride);
        __builtin_prefetch(src + 2 * src_stride);
        __builtin_prefetch(src + 3 * src_stride);
        __builtin_prefetch(src + 4 * src_stride);
        __builtin_prefetch(src + 5 * src_stride);
        __builtin_prefetch(src + 6 * src_stride);
        s = src;
        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
        s += src_stride;
        d = dst;
        height = h;

        do {
          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;
          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
          s += src_stride;

          __builtin_prefetch(d + 0 * dst_stride);
          __builtin_prefetch(d + 1 * dst_stride);
          __builtin_prefetch(d + 2 * dst_stride);
          __builtin_prefetch(d + 3 * dst_stride);
          __builtin_prefetch(s + 0 * src_stride);
          __builtin_prefetch(s + 1 * src_stride);
          __builtin_prefetch(s + 2 * src_stride);
          __builtin_prefetch(s + 3 * src_stride);
          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

          d0 = vld1q_u16(d + 0 * dst_stride);
          d1 = vld1q_u16(d + 1 * dst_stride);
          d2 = vld1q_u16(d + 2 * dst_stride);
          d3 = vld1q_u16(d + 3 * dst_stride);
          d0 = vrhaddq_u16(d0, t0);
          d1 = vrhaddq_u16(d1, t1);
          d2 = vrhaddq_u16(d2, t2);
          d3 = vrhaddq_u16(d3, t3);

          vst1q_u16(d, d0);
          d += dst_stride;
          vst1q_u16(d, d1);
          d += dst_stride;
          vst1q_u16(d, d2);
          d += dst_stride;
          vst1q_u16(d, d3);
          d += dst_stride;

          s0 = s4;
          s1 = s5;
          s2 = s6;
          s3 = s7;
          s4 = s8;
          s5 = s9;
          s6 = s10;
          height -= 4;
        } while (height > 0);
        src += 8;
        dst += 8;
        w -= 8;
      } while (w > 0);
    }
  }
}