/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>  // AVX2
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"
#include "aom_dsp/x86/transpose_sse2.h"

#include "config/av1_rtcd.h"
#include "av1/common/restoration.h"
#include "av1/encoder/pickrst.h"

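// Helper for the compute_stats kernels below: reorder 16 window bytes with
// *shuffle (g_shuffle_stats_data, defined elsewhere), widen them to 16 bits,
// multiply-add against the broadcast pixel pair in *kl, and accumulate the
// eight resulting 32-bit dot products into dst.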
static INLINE void acc_stat_avx2(int32_t *dst, const uint8_t *src,
                                 const __m128i *shuffle, const __m256i *kl) {
  const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
  const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s));
  const __m256i dst0 = yy_loadu_256(dst);
  const __m256i r0 = _mm256_add_epi32(dst0, d0);
  yy_storeu_256(dst, r0);
}

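// Accumulate Wiener-filter statistics for one row of the 7x7 (luma) window:
// walk the row two source pixels at a time, updating the running pixel sums
// sumX/sumY, the cross-correlation M, and eight H columns per acc_stat_avx2
// call.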
static INLINE void acc_stat_win7_one_line_avx2(
    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
    int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
    int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
  int j, k, l;
  const int wiener_win = WIENER_WIN;
  for (j = h_start; j < h_end; j += 2) {
    const uint8_t X1 = src[j];
    const uint8_t X2 = src[j + 1];
    *sumX += X1 + X2;
    const uint8_t *dgd_ij = dgd + j;
    for (k = 0; k < wiener_win; k++) {
      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
      for (l = 0; l < wiener_win; l++) {
        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
        const uint8_t D1 = dgd_ijk[l];
        const uint8_t D2 = dgd_ijk[l + 1];
        sumY[k][l] += D1 + D2;
        M_int[k][l] += D1 * X1 + D2 * X2;

        // Broadcast the horizontally adjacent pair (D1, D2) across all eight
        // 16-bit madd pairs, so each acc_stat_avx2 call accumulates
        // D1 * p + D2 * q for eight shuffled (p, q) window-pixel pairs.
        const __m256i kl =
            _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
        acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
      }
    }
  }
}

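// 7x7 (luma) version of av1_compute_stats. Statistics are gathered in 32-bit
// accumulators that are flushed into 64-bit ones every 64 rows so the 32-bit
// partial sums cannot overflow, then finalized in double precision with the
// mean folded back in (see the comment at the final loop below).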
static INLINE void compute_stats_win7_opt_avx2(
    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
    int v_end, int dgd_stride, int src_stride, double *M, double *H) {
  int i, j, k, l, m, n;
  const int wiener_win = WIENER_WIN;
  const int pixel_count = (h_end - h_start) * (v_end - v_start);
  const int wiener_win2 = wiener_win * wiener_win;
  const int wiener_halfwin = (wiener_win >> 1);
  const double avg =
      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);

  int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
  int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
  int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
  int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
  int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
  int32_t sumX = 0;
  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;

  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
  for (j = v_start; j < v_end; j += 64) {
    const int vert_end = AOMMIN(64, v_end - j) + j;
    for (i = j; i < vert_end; i++) {
      acc_stat_win7_one_line_avx2(
          dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
    }
    // Flush the 32-bit partial sums into the 64-bit accumulators after each
    // 64-row block so they do not overflow.
    for (k = 0; k < wiener_win; ++k) {
      for (l = 0; l < wiener_win; ++l) {
        M_int64[k][l] += M_int32[k][l];
        M_int32[k][l] = 0;
      }
    }
    for (k = 0; k < WIENER_WIN2; ++k) {
      for (l = 0; l < WIENER_WIN * 8; ++l) {
        H_int64[k][l] += H_int32[k][l];
        H_int32[k][l] = 0;
      }
    }
  }

  // Fold the mean back in while converting to double:
  // sum((d - avg) * (s - avg)) == sum(d * s) - avg * (sum(d) + sum(s))
  //                               + avg * avg * n.
  const double avg_square_sum = avg * avg * pixel_count;
  for (k = 0; k < wiener_win; k++) {
    for (l = 0; l < wiener_win; l++) {
      const int32_t idx0 = l * wiener_win + k;
      M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
      double *H_ = H + idx0 * wiener_win2;
      int64_t *H_int_ = &H_int64[idx0][0];
      for (m = 0; m < wiener_win; m++) {
        for (n = 0; n < wiener_win; n++) {
          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
                                   avg * (sumY[k][l] + sumY[n][m]);
        }
      }
    }
  }
}

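// 5x5 (chroma) counterpart of acc_stat_win7_one_line_avx2.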
static INLINE void acc_stat_win5_one_line_avx2(
    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
    int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
    int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
    int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
  int j, k, l;
  const int wiener_win = WIENER_WIN_CHROMA;
  for (j = h_start; j < h_end; j += 2) {
    const uint8_t X1 = src[j];
    const uint8_t X2 = src[j + 1];
    *sumX += X1 + X2;
    const uint8_t *dgd_ij = dgd + j;
    for (k = 0; k < wiener_win; k++) {
      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
      for (l = 0; l < wiener_win; l++) {
        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
        const uint8_t D1 = dgd_ijk[l];
        const uint8_t D2 = dgd_ijk[l + 1];
        sumY[k][l] += D1 + D2;
        M_int[k][l] += D1 * X1 + D2 * X2;

        const __m256i kl =
            _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
        acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
      }
    }
  }
}

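// 5x5 (chroma) counterpart of compute_stats_win7_opt_avx2; same structure
// with WIENER_WIN_CHROMA-sized accumulators.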
static INLINE void compute_stats_win5_opt_avx2(
    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
    int v_end, int dgd_stride, int src_stride, double *M, double *H) {
  int i, j, k, l, m, n;
  const int wiener_win = WIENER_WIN_CHROMA;
  const int pixel_count = (h_end - h_start) * (v_end - v_start);
  const int wiener_win2 = wiener_win * wiener_win;
  const int wiener_halfwin = (wiener_win >> 1);
  const double avg =
      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);

  int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
  int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
  int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
  int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
  int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
  int32_t sumX = 0;
  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;

  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
  for (j = v_start; j < v_end; j += 64) {
    const int vert_end = AOMMIN(64, v_end - j) + j;
    for (i = j; i < vert_end; i++) {
      acc_stat_win5_one_line_avx2(
          dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
    }
    for (k = 0; k < wiener_win; ++k) {
      for (l = 0; l < wiener_win; ++l) {
        M_int64[k][l] += M_int32[k][l];
        M_int32[k][l] = 0;
      }
    }
    for (k = 0; k < WIENER_WIN2_CHROMA; ++k) {
      for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
        H_int64[k][l] += H_int32[k][l];
        H_int32[k][l] = 0;
      }
    }
  }

  const double avg_square_sum = avg * avg * pixel_count;
  for (k = 0; k < wiener_win; k++) {
    for (l = 0; l < wiener_win; l++) {
      const int32_t idx0 = l * wiener_win + k;
      M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
      double *H_ = H + idx0 * wiener_win2;
      int64_t *H_int_ = &H_int64[idx0][0];
      for (m = 0; m < wiener_win; m++) {
        for (n = 0; n < wiener_win; n++) {
          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
                                   avg * (sumY[k][l] + sumY[n][m]);
        }
      }
    }
  }
}

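// Dispatch on the window size: AVX2 paths exist for the 7x7 (luma) and
// 5x5 (chroma) Wiener windows; anything else falls back to the C version.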
void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd,
                            const uint8_t *src, int h_start, int h_end,
                            int v_start, int v_end, int dgd_stride,
                            int src_stride, double *M, double *H) {
  if (wiener_win == WIENER_WIN) {
    compute_stats_win7_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
                                dgd_stride, src_stride, M, H);
  } else if (wiener_win == WIENER_WIN_CHROMA) {
    compute_stats_win5_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
                                dgd_stride, src_stride, M, H);
  } else {
    av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
                        dgd_stride, src_stride, M, H);
  }
}

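// Replicate the 16-bit pair (a, b) into every 32-bit lane, matching the
// operand layout _mm256_madd_epi16 expects: each lane then computes
// a * x + b * y for an interleaved (x, y) input pair.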
static INLINE __m256i pair_set_epi16(uint16_t a, uint16_t b) {
  return _mm256_set1_epi32(
      (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
}

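// Sum of squared errors for self-guided restoration at low bit depth. Four
// paths depending on which of the two filters are enabled; each handles 16
// pixels per iteration with a scalar loop for the tail. Per-row 32-bit error
// sums are widened to 64 bits before accumulation (the no-filter path keeps
// a single 32-bit accumulator for the whole block, which is presumably safe
// for restoration-unit-sized inputs).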
int64_t av1_lowbd_pixel_proj_error_avx2(
    const uint8_t *src8, int width, int height, int src_stride,
    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
  int i, j, k;
  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
  const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
  __m256i sum64 = _mm256_setzero_si256();
  const uint8_t *src = src8;
  const uint8_t *dat = dat8;
  int64_t err = 0;
  if (params->r[0] > 0 && params->r[1] > 0) {
    // Both filters enabled: v = xq[0] * (flt0 - u) + xq[1] * (flt1 - u),
    // with u the upshifted source pixel.
    __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]);
    for (i = 0; i < height; ++i) {
      __m256i sum32 = _mm256_setzero_si256();
      for (j = 0; j <= width - 16; j += 16) {
        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
        const __m256i flt0_16b = _mm256_permute4x64_epi64(
            _mm256_packs_epi32(yy_loadu_256(flt0 + j),
                               yy_loadu_256(flt0 + j + 8)),
            0xd8);
        const __m256i flt1_16b = _mm256_permute4x64_epi64(
            _mm256_packs_epi32(yy_loadu_256(flt1 + j),
                               yy_loadu_256(flt1 + j + 8)),
            0xd8);
        const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
        const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0);
        const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0);
        const __m256i v0 = _mm256_madd_epi16(
            xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
        const __m256i v1 = _mm256_madd_epi16(
            xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
        const __m256i vr0 =
            _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
        const __m256i vr1 =
            _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
        const __m256i e0 = _mm256_sub_epi16(
            _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
        const __m256i err0 = _mm256_madd_epi16(e0, e0);
        sum32 = _mm256_add_epi32(sum32, err0);
      }
      // Scalar tail for widths that are not a multiple of 16.
      for (k = j; k < width; ++k) {
        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
        int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
        err += e * e;
      }
      dat += dat_stride;
      src += src_stride;
      flt0 += flt0_stride;
      flt1 += flt1_stride;
      // Widen the row's 32-bit error sums to 64 bits before accumulating.
      const __m256i sum64_0 =
          _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
      const __m256i sum64_1 =
          _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
      sum64 = _mm256_add_epi64(sum64, sum64_0);
      sum64 = _mm256_add_epi64(sum64, sum64_1);
    }
  } else if (params->r[0] > 0) {
    // Only the first filter: fold the -xq[0] * u term into the madd by
    // pairing xq[0] with -xq[0] << SGRPROJ_RST_BITS against interleaved
    // (flt0, dat), which equals xq[0] * (flt0 - u) without forming u.
    __m256i xq_coeff =
        pair_set_epi16(xq[0], (-xq[0] * (1 << SGRPROJ_RST_BITS)));
    for (i = 0; i < height; ++i) {
      __m256i sum32 = _mm256_setzero_si256();
      for (j = 0; j <= width - 16; j += 16) {
        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
        const __m256i flt0_16b = _mm256_permute4x64_epi64(
            _mm256_packs_epi32(yy_loadu_256(flt0 + j),
                               yy_loadu_256(flt0 + j + 8)),
            0xd8);
        const __m256i v0 =
            _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt0_16b, d0));
        const __m256i v1 =
            _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt0_16b, d0));
        const __m256i vr0 =
            _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
        const __m256i vr1 =
            _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
        const __m256i e0 = _mm256_sub_epi16(
            _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
        const __m256i err0 = _mm256_madd_epi16(e0, e0);
        sum32 = _mm256_add_epi32(sum32, err0);
      }
      for (k = j; k < width; ++k) {
        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
        int32_t v = xq[0] * (flt0[k] - u);
        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
        err += e * e;
      }
      dat += dat_stride;
      src += src_stride;
      flt0 += flt0_stride;
      const __m256i sum64_0 =
          _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
      const __m256i sum64_1 =
          _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
      sum64 = _mm256_add_epi64(sum64, sum64_0);
      sum64 = _mm256_add_epi64(sum64, sum64_1);
    }
  } else if (params->r[1] > 0) {
    // Only the second filter: same folding as above, with flt1 and xq[1].
    __m256i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS));
    for (i = 0; i < height; ++i) {
      __m256i sum32 = _mm256_setzero_si256();
      for (j = 0; j <= width - 16; j += 16) {
        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
        const __m256i flt1_16b = _mm256_permute4x64_epi64(
            _mm256_packs_epi32(yy_loadu_256(flt1 + j),
                               yy_loadu_256(flt1 + j + 8)),
            0xd8);
        const __m256i v0 =
            _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt1_16b, d0));
        const __m256i v1 =
            _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt1_16b, d0));
        const __m256i vr0 =
            _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
        const __m256i vr1 =
            _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
        const __m256i e0 = _mm256_sub_epi16(
            _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
        const __m256i err0 = _mm256_madd_epi16(e0, e0);
        sum32 = _mm256_add_epi32(sum32, err0);
      }
      for (k = j; k < width; ++k) {
        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
        int32_t v = xq[1] * (flt1[k] - u);
        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
        err += e * e;
      }
      dat += dat_stride;
      src += src_stride;
      flt1 += flt1_stride;
      const __m256i sum64_0 =
          _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
      const __m256i sum64_1 =
          _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
      sum64 = _mm256_add_epi64(sum64, sum64_0);
      sum64 = _mm256_add_epi64(sum64, sum64_1);
    }
  } else {
    // Neither filter enabled: plain sum of squared differences between dat
    // and src.
    __m256i sum32 = _mm256_setzero_si256();
    for (i = 0; i < height; ++i) {
      for (j = 0; j <= width - 16; j += 16) {
        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
        const __m256i diff0 = _mm256_sub_epi16(d0, s0);
        const __m256i err0 = _mm256_madd_epi16(diff0, diff0);
        sum32 = _mm256_add_epi32(sum32, err0);
      }
      for (k = j; k < width; ++k) {
        const int32_t e = (int32_t)(dat[k]) - src[k];
        err += e * e;
      }
      dat += dat_stride;
      src += src_stride;
    }
    const __m256i sum64_0 =
        _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
    const __m256i sum64_1 =
        _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
    sum64 = _mm256_add_epi64(sum64_0, sum64_1);
  }
  // Reduce the four 64-bit lanes and add the scalar-tail contribution.
  int64_t sum[4];
  yy_storeu_256(sum, sum64);
  err += sum[0] + sum[1] + sum[2] + sum[3];
  return err;
}