/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"
#include "vpx_ports/arm.h"

/*
 * Core VP8 "normal" loop filter applied to 16 pixel positions at once.
 * q3..q10 hold the eight pixel rows p3 p2 p1 p0 q0 q1 q2 q3 straddling
 * the edge (one byte lane per pixel position).  The filtered inner rows
 * p1 p0 q0 q1 are written back through q5r/q6r/q7r/q8r.
 * qblimit/qlimit/qthresh are the flimit/limit/thresh thresholds splatted
 * across all 16 lanes.
 */
static INLINE void vp8_loop_filter_neon(
        uint8x16_t qblimit,  // flimit
        uint8x16_t qlimit,   // limit
        uint8x16_t qthresh,  // thresh
        uint8x16_t q3,       // p3
        uint8x16_t q4,       // p2
        uint8x16_t q5,       // p1
        uint8x16_t q6,       // p0
        uint8x16_t q7,       // q0
        uint8x16_t q8,       // q1
        uint8x16_t q9,       // q2
        uint8x16_t q10,      // q3
        uint8x16_t *q5r,     // p1
        uint8x16_t *q6r,     // p0
        uint8x16_t *q7r,     // q0
        uint8x16_t *q8r) {   // q1
    uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    int16x8_t q2s16, q11s16;
    uint16x8_t q4u16;
    int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
    int8x8_t d2s8, d3s8;

    // Absolute differences between neighbouring rows; these feed the
    // filter mask: a lane is only filtered when every one of them is
    // within 'limit'.
    q11u8 = vabdq_u8(q3, q4);    // |p3 - p2|
    q12u8 = vabdq_u8(q4, q5);    // |p2 - p1|
    q13u8 = vabdq_u8(q5, q6);    // |p1 - p0|
    q14u8 = vabdq_u8(q8, q7);    // |q1 - q0|
    q3    = vabdq_u8(q9, q8);    // |q2 - q1|
    q4    = vabdq_u8(q10, q9);   // |q3 - q2|

    // Fold the six differences down to their per-lane maximum.
    q11u8 = vmaxq_u8(q11u8, q12u8);
    q12u8 = vmaxq_u8(q13u8, q14u8);
    q3    = vmaxq_u8(q3, q4);
    q15u8 = vmaxq_u8(q11u8, q12u8);

    q9 = vabdq_u8(q6, q7);       // |p0 - q0|

    // vp8_hevmask: flag "high edge variance" lanes, where |p1-p0| or
    // |q1-q0| exceeds 'thresh'.  Those lanes skip the outer-tap update.
    q13u8 = vcgtq_u8(q13u8, qthresh);
    q14u8 = vcgtq_u8(q14u8, qthresh);
    q15u8 = vmaxq_u8(q15u8, q3);

    q2u8 = vabdq_u8(q5, q8);     // |p1 - q1|
    q9 = vqaddq_u8(q9, q9);      // 2 * |p0 - q0| (saturating)

    // Filter-mask part 1: max neighbour difference <= limit.
    q15u8 = vcgeq_u8(qlimit, q15u8);

    // vp8_filter() function
    // convert to signed: bias pixels by 0x80 so the arithmetic below
    // can run in saturating signed 8-bit.
    q10 = vdupq_n_u8(0x80);
    q8 = veorq_u8(q8, q10);
    q7 = veorq_u8(q7, q10);
    q6 = veorq_u8(q6, q10);
    q5 = veorq_u8(q5, q10);

    // Filter-mask part 2 operand: 2*|p0-q0| + |p1-q1|/2.
    q2u8 = vshrq_n_u8(q2u8, 1);
    q9 = vqaddq_u8(q9, q2u8);

    q10 = vdupq_n_u8(3);

    // (qs0 - ps0) widened to 16 bits so the 3x multiply cannot overflow.
    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                     vget_low_s8(vreinterpretq_s8_u8(q6)));
    q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                      vget_high_s8(vreinterpretq_s8_u8(q6)));

    // Filter-mask part 2: 2*|p0-q0| + |p1-q1|/2 <= flimit.
    q9 = vcgeq_u8(qblimit, q9);

    // vp8_filter = clamp(ps1 - qs1)
    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
                    vreinterpretq_s8_u8(q8));

    // Combined hev mask: either threshold exceeded.
    q14u8 = vorrq_u8(q13u8, q14u8);

    // 3 * (qs0 - ps0)
    q4u16 = vmovl_u8(vget_low_u8(q10));
    q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
    q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));

    // The (ps1 - qs1) term only contributes on high-variance lanes.
    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
    q15u8 = vandq_u8(q15u8, q9);  // final filter mask (both parts)

    q1s8 = vreinterpretq_s8_u8(q1u8);
    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
    q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));

    q9 = vdupq_n_u8(4);
    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    d2s8 = vqmovn_s16(q2s16);
    d3s8 = vqmovn_s16(q11s16);
    q1s8 = vcombine_s8(d2s8, d3s8);
    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
    q1s8 = vreinterpretq_s8_u8(q1u8);

    // Filter2 = clamp(vp8_filter + 3) >> 3
    // Filter1 = clamp(vp8_filter + 4) >> 3
    q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
    q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
    q2s8 = vshrq_n_s8(q2s8, 3);
    q1s8 = vshrq_n_s8(q1s8, 3);

    // ps0 += Filter2; qs0 -= Filter1 (saturating)
    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);

    // Outer-tap adjustment (Filter1 + 1) >> 1, suppressed on
    // high-variance lanes via the bit-clear with the hev mask.
    q1s8 = vrshrq_n_s8(q1s8, 1);
    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

    // ps1 += filter; qs1 -= filter
    q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
    q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);

    // Undo the 0x80 bias to return to unsigned pixel values.
    q0u8 = vdupq_n_u8(0x80);
    *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
    *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
    *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
    *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
    return;
}
126 
/*
 * Filter a horizontal edge of the Y plane.  The edge lies between
 * src[-pitch] and src[0]; four rows above and four rows below the edge
 * are loaded (16 pixels wide), filtered by vp8_loop_filter_neon, and
 * the four inner rows (p1 p0 q0 q1) are written back in place.
 */
void vp8_loop_filter_horizontal_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    const uint8x16_t qblimit = vdupq_n_u8(blimit);
    const uint8x16_t qlimit  = vdupq_n_u8(limit);
    const uint8x16_t qthresh = vdupq_n_u8(thresh);
    uint8x16_t row[8];  // p3 p2 p1 p0 q0 q1 q2 q3
    unsigned char *base = src - (pitch << 2);  // first row above edge (p3)
    int i;

    for (i = 0; i < 8; i++) {
        row[i] = vld1q_u8(base + i * pitch);
    }

    vp8_loop_filter_neon(qblimit, qlimit, qthresh,
                         row[0], row[1], row[2], row[3],
                         row[4], row[5], row[6], row[7],
                         &row[2], &row[3], &row[4], &row[5]);

    // Only the inner four rows change.
    for (i = 2; i < 6; i++) {
        vst1q_u8(base + i * pitch, row[i]);
    }
}
171 
/*
 * Filter a horizontal edge of the U and V planes simultaneously.
 * Each chroma row is 8 pixels wide, so a U row and the matching V row
 * are packed into one 16-lane vector (U low half, V high half) and run
 * through the same filter as the Y plane.  The four inner rows of both
 * planes are written back in place.
 */
void vp8_loop_filter_horizontal_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    const uint8x16_t qblimit = vdupq_n_u8(blimit);
    const uint8x16_t qlimit  = vdupq_n_u8(limit);
    const uint8x16_t qthresh = vdupq_n_u8(thresh);
    uint8x16_t row[8];  // p3..q3; U in the low half, V in the high half
    unsigned char *ubase = u - (pitch << 2);
    unsigned char *vbase = v - (pitch << 2);
    int i;

    for (i = 0; i < 8; i++) {
        row[i] = vcombine_u8(vld1_u8(ubase + i * pitch),
                             vld1_u8(vbase + i * pitch));
    }

    vp8_loop_filter_neon(qblimit, qlimit, qthresh,
                         row[0], row[1], row[2], row[3],
                         row[4], row[5], row[6], row[7],
                         &row[2], &row[3], &row[4], &row[5]);

    // Split each filtered vector back into its U and V halves.
    for (i = 2; i < 6; i++) {
        vst1_u8(ubase + i * pitch, vget_low_u8(row[i]));
        vst1_u8(vbase + i * pitch, vget_high_u8(row[i]));
    }
}
254 
/*
 * Store a transposed 4x8 patch.  result.val[0..3] are four 8-lane
 * vectors, one per pixel column; lane i of each holds row i's byte.
 * Eight rows of 4 bytes are written starting at dst, one per pitch.
 */
static INLINE void write_4x8(unsigned char *dst, int pitch,
                             const uint8x8x4_t result) {
#ifdef VPX_INCOMPATIBLE_GCC
    // NOTE(review): presumably a workaround for GCC versions that
    // mishandle vst4_lane_u8 — emulate the interleaved store with vtrn
    // transposes plus 32-bit lane stores.  Confirm against the macro's
    // definition site.
    /*
     * uint8x8x4_t result
    00 01 02 03 | 04 05 06 07
    10 11 12 13 | 14 15 16 17
    20 21 22 23 | 24 25 26 27
    30 31 32 33 | 34 35 36 37
    ---
    * after vtrn_u16
    00 01 20 21 | 04 05 24 25
    02 03 22 23 | 06 07 26 27
    10 11 30 31 | 14 15 34 35
    12 13 32 33 | 16 17 36 37
    ---
    * after vtrn_u8
    00 10 20 30 | 04 14 24 34
    01 11 21 31 | 05 15 25 35
    02 12 22 32 | 06 16 26 36
    03 13 23 33 | 07 17 27 37
    */
    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
                                          vreinterpret_u16_u8(result.val[2]));
    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
                                          vreinterpret_u16_u8(result.val[3]));
    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
                                       vreinterpret_u8_u16(r13_u16.val[0]));
    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
                                       vreinterpret_u8_u16(r13_u16.val[1]));
    // Each 32-bit lane now holds one complete 4-byte output row:
    // lane 0 = rows 0..3, lane 1 = rows 4..7.
    const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
    const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
    const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
    const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
    vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
#else
    // vst4_lane_u8 interleaves lane k of val[0..3] into 4 consecutive
    // bytes — exactly one output row per call.
    vst4_lane_u8(dst, result, 0);
    dst += pitch;
    vst4_lane_u8(dst, result, 1);
    dst += pitch;
    vst4_lane_u8(dst, result, 2);
    dst += pitch;
    vst4_lane_u8(dst, result, 3);
    dst += pitch;
    vst4_lane_u8(dst, result, 4);
    dst += pitch;
    vst4_lane_u8(dst, result, 5);
    dst += pitch;
    vst4_lane_u8(dst, result, 6);
    dst += pitch;
    vst4_lane_u8(dst, result, 7);
#endif  // VPX_INCOMPATIBLE_GCC
}
322 
/*
 * Filter a vertical edge in the Y plane.  The edge lies between
 * src[-1] and src[0] on each row.  Eight bytes (p3..q3) are loaded
 * from each of 16 rows, the 16x8 block is transposed so each pixel
 * column becomes one 16-lane vector, the common filter runs, and the
 * four modified inner columns are transposed back and stored with
 * write_4x8().
 */
void vp8_loop_filter_vertical_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    unsigned char *s, *d;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
    uint8x8x4_t q4ResultH, q4ResultL;

    qblimit = vdupq_n_u8(blimit);
    qlimit  = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    // Load 8 bytes from each of 16 rows; even-numbered d registers get
    // rows 0-7, odd-numbered ones rows 8-15, so each vcombine below
    // pairs row i (low half) with row i+8 (high half).
    s = src - 4;
    d6  = vld1_u8(s);
    s += pitch;
    d8  = vld1_u8(s);
    s += pitch;
    d10 = vld1_u8(s);
    s += pitch;
    d12 = vld1_u8(s);
    s += pitch;
    d14 = vld1_u8(s);
    s += pitch;
    d16 = vld1_u8(s);
    s += pitch;
    d18 = vld1_u8(s);
    s += pitch;
    d20 = vld1_u8(s);
    s += pitch;
    d7  = vld1_u8(s);
    s += pitch;
    d9  = vld1_u8(s);
    s += pitch;
    d11 = vld1_u8(s);
    s += pitch;
    d13 = vld1_u8(s);
    s += pitch;
    d15 = vld1_u8(s);
    s += pitch;
    d17 = vld1_u8(s);
    s += pitch;
    d19 = vld1_u8(s);
    s += pitch;
    d21 = vld1_u8(s);

    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    // Transpose the 16x8 byte block in three vtrn stages
    // (32-bit, then 16-bit, then 8-bit granularity).
    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                       vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                       vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    // q3..q10 now hold the pixel columns p3 p2 p1 p0 q0 q1 q2 q3.
    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q5, &q6, &q7, &q8);

    // Gather the four filtered columns: low halves correspond to rows
    // 0-7, high halves to rows 8-15.
    q4ResultL.val[0] = vget_low_u8(q5);   // d10
    q4ResultL.val[1] = vget_low_u8(q6);   // d12
    q4ResultL.val[2] = vget_low_u8(q7);   // d14
    q4ResultL.val[3] = vget_low_u8(q8);   // d16
    q4ResultH.val[0] = vget_high_u8(q5);  // d11
    q4ResultH.val[1] = vget_high_u8(q6);  // d13
    q4ResultH.val[2] = vget_high_u8(q7);  // d15
    q4ResultH.val[3] = vget_high_u8(q8);  // d17

    // Store p1 p0 q0 q1 back: rows 0-7 then rows 8-15.
    d = src - 2;
    write_4x8(d, pitch, q4ResultL);
    d += pitch * 8;
    write_4x8(d, pitch, q4ResultH);
}
435 
/*
 * Filter a vertical edge in the U and V planes simultaneously.  Eight
 * rows are read from each plane (8 bytes per row, p3..q3 around the
 * edge), packed so U rows occupy the low halves and V rows the high
 * halves, transposed to column vectors, filtered, and the four inner
 * columns of each plane stored back with write_4x8().
 */
void vp8_loop_filter_vertical_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    unsigned char *us, *ud;
    unsigned char *vs, *vd;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
    uint8x8x4_t q4ResultH, q4ResultL;

    qblimit = vdupq_n_u8(blimit);
    qlimit  = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    // Load 8 U rows into the even-numbered d registers...
    us = u - 4;
    d6 = vld1_u8(us);
    us += pitch;
    d8 = vld1_u8(us);
    us += pitch;
    d10 = vld1_u8(us);
    us += pitch;
    d12 = vld1_u8(us);
    us += pitch;
    d14 = vld1_u8(us);
    us += pitch;
    d16 = vld1_u8(us);
    us += pitch;
    d18 = vld1_u8(us);
    us += pitch;
    d20 = vld1_u8(us);

    // ...and 8 V rows into the odd-numbered ones, so each vcombine
    // below pairs a U row (low half) with its V row (high half).
    vs = v - 4;
    d7 = vld1_u8(vs);
    vs += pitch;
    d9 = vld1_u8(vs);
    vs += pitch;
    d11 = vld1_u8(vs);
    vs += pitch;
    d13 = vld1_u8(vs);
    vs += pitch;
    d15 = vld1_u8(vs);
    vs += pitch;
    d17 = vld1_u8(vs);
    vs += pitch;
    d19 = vld1_u8(vs);
    vs += pitch;
    d21 = vld1_u8(vs);

    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    // Transpose the 16x8 byte block in three vtrn stages
    // (32-bit, then 16-bit, then 8-bit granularity).
    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                       vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                       vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    // q3..q10 now hold the pixel columns p3 p2 p1 p0 q0 q1 q2 q3.
    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q5, &q6, &q7, &q8);

    // U plane: the low halves of the filtered columns.
    q4ResultL.val[0] = vget_low_u8(q5);   // d10
    q4ResultL.val[1] = vget_low_u8(q6);   // d12
    q4ResultL.val[2] = vget_low_u8(q7);   // d14
    q4ResultL.val[3] = vget_low_u8(q8);   // d16
    ud = u - 2;
    write_4x8(ud, pitch, q4ResultL);

    // V plane: the high halves.
    q4ResultH.val[0] = vget_high_u8(q5);  // d11
    q4ResultH.val[1] = vget_high_u8(q6);  // d13
    q4ResultH.val[2] = vget_high_u8(q7);  // d15
    q4ResultH.val[3] = vget_high_u8(q8);  // d17
    vd = v - 2;
    write_4x8(vd, pitch, q4ResultH);
}
551