/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

// Averaging "copy" predictor for high-bitdepth pixels (stored as uint16_t):
// for every pixel, dst = (src + dst + 1) >> 1, i.e. the rounded mean of the
// source and destination blocks, computed with the NEON rounding halving add
// vrhaddq_u16.  The filter_* and bd parameters exist only to match the
// common convolve prototype and are unused here.
//
// NOTE(review): the two-rows-per-iteration loops assume h is even, and the
// w < 8 branch assumes w == 4 (4-lane loads) — consistent with the VP9/VP10
// block sizes this is called with, but confirm against callers.
void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int filter_x_stride,
                                  const int16_t *filter_y, int filter_y_stride,
                                  int w, int h, int bd) {
  // High-bitdepth planes are passed through uint8_t* and converted back to
  // their real uint16_t* type with the project macro.
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  (void)bd;

  if (w < 8) {  // avg4: combine two 4-wide rows into one 128-bit vector so a
                // single vrhaddq_u16 averages both rows at once.
    uint16x4_t s0, s1, d0, d1;
    uint16x8_t s01, d01;
    do {
      s0 = vld1_u16(src);
      d0 = vld1_u16(dst);
      src += src_stride;
      s1 = vld1_u16(src);
      d1 = vld1_u16(dst + dst_stride);
      src += src_stride;
      s01 = vcombine_u16(s0, s1);
      d01 = vcombine_u16(d0, d1);
      d01 = vrhaddq_u16(s01, d01);
      vst1_u16(dst, vget_low_u16(d01));
      dst += dst_stride;
      vst1_u16(dst, vget_high_u16(d01));
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 8) {  // avg8: one 8-lane vector per row, two rows per pass.
    uint16x8_t s0, s1, d0, d1;
    do {
      s0 = vld1q_u16(src);
      d0 = vld1q_u16(dst);
      src += src_stride;
      s1 = vld1q_u16(src);
      d1 = vld1q_u16(dst + dst_stride);
      src += src_stride;

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);

      vst1q_u16(dst, d0);
      dst += dst_stride;
      vst1q_u16(dst, d1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w < 32) {  // avg16: two 8-lane vectors per row, two rows per pass.
    uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h;
    do {
      s0l = vld1q_u16(src);
      s0h = vld1q_u16(src + 8);
      d0l = vld1q_u16(dst);
      d0h = vld1q_u16(dst + 8);
      src += src_stride;
      s1l = vld1q_u16(src);
      s1h = vld1q_u16(src + 8);
      d1l = vld1q_u16(dst + dst_stride);
      d1h = vld1q_u16(dst + dst_stride + 8);
      src += src_stride;

      d0l = vrhaddq_u16(s0l, d0l);
      d0h = vrhaddq_u16(s0h, d0h);
      d1l = vrhaddq_u16(s1l, d1l);
      d1h = vrhaddq_u16(s1h, d1h);

      vst1q_u16(dst, d0l);
      vst1q_u16(dst + 8, d0h);
      dst += dst_stride;
      vst1q_u16(dst, d1l);
      vst1q_u16(dst + 8, d1h);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 32) {  // avg32: four 8-lane vectors per row; the loop body
                         // is unrolled to process two rows per iteration.
    uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      src += src_stride;

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      dst += dst_stride;

      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      src += src_stride;

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // avg64: eight 8-lane vectors per row, split into two halves of
            // four to keep register pressure bounded; one row per iteration.
    uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);

      s0 = vld1q_u16(src + 32);
      s1 = vld1q_u16(src + 40);
      s2 = vld1q_u16(src + 48);
      s3 = vld1q_u16(src + 56);
      d0 = vld1q_u16(dst + 32);
      d1 = vld1q_u16(dst + 40);
      d2 = vld1q_u16(dst + 48);
      d3 = vld1q_u16(dst + 56);

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst + 32, d0);
      vst1q_u16(dst + 40, d1);
      vst1q_u16(dst + 48, d2);
      vst1q_u16(dst + 56, d3);
      src += src_stride;
      dst += dst_stride;
    } while (--h);
  }
}