/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>  // AVX2
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"
#include "aom_dsp/x86/transpose_sse2.h"

#include "config/av1_rtcd.h"
#include "av1/common/restoration.h"
#include "av1/encoder/pickrst.h"

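// Multiply-accumulate helper: load 16 bytes from src, rearrange them with
// *shuffle into the byte pairs expected by _mm256_madd_epi16, widen them to
// 16 bits, multiply against the (D1, D2) pair broadcast in *kl, and add the
// eight 32-bit products into dst[0..7].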
static INLINE void acc_stat_avx2(int32_t *dst, const uint8_t *src,
                                 const __m128i *shuffle, const __m256i *kl) {
  const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
  const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s));
  const __m256i dst0 = yy_loadu_256(dst);
  const __m256i r0 = _mm256_add_epi32(dst0, d0);
  yy_storeu_256(dst, r0);
}

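// Accumulate one row of statistics for the 7x7 Wiener window. Each iteration
// consumes two source pixels (X1, X2) and two window columns: sumX and sumY
// collect plain pixel sums, M_int the dgd * src cross products and H_int
// the dgd * dgd products, with one acc_stat_avx2 call per window row
// covering eight H entries at a time.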
static INLINE void acc_stat_win7_one_line_avx2(
    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
    int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
    int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
  int j, k, l;
  const int wiener_win = WIENER_WIN;
  for (j = h_start; j < h_end; j += 2) {
    const uint8_t X1 = src[j];
    const uint8_t X2 = src[j + 1];
    *sumX += X1 + X2;
    const uint8_t *dgd_ij = dgd + j;
    for (k = 0; k < wiener_win; k++) {
      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
      for (l = 0; l < wiener_win; l++) {
        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
        const uint8_t D1 = dgd_ijk[l];
        const uint8_t D2 = dgd_ijk[l + 1];
        sumY[k][l] += D1 + D2;
        M_int[k][l] += D1 * X1 + D2 * X2;

        const __m256i kl =
            _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
        acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
      }
    }
  }
}

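// Compute the Wiener filter statistics for the 7x7 window: M holds the
// mean-removed dgd/src cross correlation and H the mean-removed dgd auto
// correlation. Rows are processed in chunks of 64, flushing the 32-bit
// accumulators into 64-bit ones between chunks so the intermediate sums stay
// within int32 range; the final loops convert to double and rearrange the
// 8-wide SIMD layout of H_int into the wiener_win2 x wiener_win2 matrix
// expected by the caller.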
static INLINE void compute_stats_win7_opt_avx2(
    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
    int v_end, int dgd_stride, int src_stride, double *M, double *H) {
  int i, j, k, l, m, n;
  const int wiener_win = WIENER_WIN;
  const int pixel_count = (h_end - h_start) * (v_end - v_start);
  const int wiener_win2 = wiener_win * wiener_win;
  const int wiener_halfwin = (wiener_win >> 1);
  const double avg =
      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);

  int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
  int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
  int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
  int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
  int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
  int32_t sumX = 0;
  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;

  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
  for (j = v_start; j < v_end; j += 64) {
    const int vert_end = AOMMIN(64, v_end - j) + j;
    for (i = j; i < vert_end; i++) {
      acc_stat_win7_one_line_avx2(
          dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
    }
    for (k = 0; k < wiener_win; ++k) {
      for (l = 0; l < wiener_win; ++l) {
        M_int64[k][l] += M_int32[k][l];
        M_int32[k][l] = 0;
      }
    }
    for (k = 0; k < WIENER_WIN2; ++k) {
      for (l = 0; l < WIENER_WIN * 8; ++l) {
        H_int64[k][l] += H_int32[k][l];
        H_int32[k][l] = 0;
      }
    }
  }

  const double avg_square_sum = avg * avg * pixel_count;
  for (k = 0; k < wiener_win; k++) {
    for (l = 0; l < wiener_win; l++) {
      const int32_t idx0 = l * wiener_win + k;
      M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
      double *H_ = H + idx0 * wiener_win2;
      int64_t *H_int_ = &H_int64[idx0][0];
      for (m = 0; m < wiener_win; m++) {
        for (n = 0; n < wiener_win; n++) {
          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
                                   avg * (sumY[k][l] + sumY[n][m]);
        }
      }
    }
  }
}

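// Same per-row accumulation as acc_stat_win7_one_line_avx2, but for the 5x5
// (chroma) Wiener window, so there are five acc_stat_avx2 calls per tap
// instead of seven.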
static INLINE void acc_stat_win5_one_line_avx2(
    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
    int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
    int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
    int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
  int j, k, l;
  const int wiener_win = WIENER_WIN_CHROMA;
  for (j = h_start; j < h_end; j += 2) {
    const uint8_t X1 = src[j];
    const uint8_t X2 = src[j + 1];
    *sumX += X1 + X2;
    const uint8_t *dgd_ij = dgd + j;
    for (k = 0; k < wiener_win; k++) {
      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
      for (l = 0; l < wiener_win; l++) {
        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
        const uint8_t D1 = dgd_ijk[l];
        const uint8_t D2 = dgd_ijk[l + 1];
        sumY[k][l] += D1 + D2;
        M_int[k][l] += D1 * X1 + D2 * X2;

        const __m256i kl =
            _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
        acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
        acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
      }
    }
  }
}

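// 5x5 (chroma) counterpart of compute_stats_win7_opt_avx2: identical
// structure, using WIENER_WIN_CHROMA-sized accumulators and the 5x5 per-row
// routine above.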
static INLINE void compute_stats_win5_opt_avx2(
    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
    int v_end, int dgd_stride, int src_stride, double *M, double *H) {
  int i, j, k, l, m, n;
  const int wiener_win = WIENER_WIN_CHROMA;
  const int pixel_count = (h_end - h_start) * (v_end - v_start);
  const int wiener_win2 = wiener_win * wiener_win;
  const int wiener_halfwin = (wiener_win >> 1);
  const double avg =
      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);

  int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
  int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
  int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
  int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
  int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
  int32_t sumX = 0;
  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;

  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
  for (j = v_start; j < v_end; j += 64) {
    const int vert_end = AOMMIN(64, v_end - j) + j;
    for (i = j; i < vert_end; i++) {
      acc_stat_win5_one_line_avx2(
          dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
    }
    for (k = 0; k < wiener_win; ++k) {
      for (l = 0; l < wiener_win; ++l) {
        M_int64[k][l] += M_int32[k][l];
        M_int32[k][l] = 0;
      }
    }
    for (k = 0; k < WIENER_WIN2_CHROMA; ++k) {
      for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
        H_int64[k][l] += H_int32[k][l];
        H_int32[k][l] = 0;
      }
    }
  }

  const double avg_square_sum = avg * avg * pixel_count;
  for (k = 0; k < wiener_win; k++) {
    for (l = 0; l < wiener_win; l++) {
      const int32_t idx0 = l * wiener_win + k;
      M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
      double *H_ = H + idx0 * wiener_win2;
      int64_t *H_int_ = &H_int64[idx0][0];
      for (m = 0; m < wiener_win; m++) {
        for (n = 0; n < wiener_win; n++) {
          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
                                   avg * (sumY[k][l] + sumY[n][m]);
        }
      }
    }
  }
}

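// Dispatch on the window size: AVX2 paths exist for the 7x7 (luma) and the
// 5x5 (chroma) Wiener windows; any other size falls back to the C code.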
void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd,
                            const uint8_t *src, int h_start, int h_end,
                            int v_start, int v_end, int dgd_stride,
                            int src_stride, double *M, double *H) {
  if (wiener_win == WIENER_WIN) {
    compute_stats_win7_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
                                dgd_stride, src_stride, M, H);
  } else if (wiener_win == WIENER_WIN_CHROMA) {
    compute_stats_win5_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
                                dgd_stride, src_stride, M, H);
  } else {
    av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
                        dgd_stride, src_stride, M, H);
  }
}

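// Broadcast the 16-bit pair (a, b) into every 32-bit lane of a 256-bit
// vector, i.e. each lane holds a | (b << 16), ready for use as the
// coefficient operand of _mm256_madd_epi16.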
static INLINE __m256i pair_set_epi16(uint16_t a, uint16_t b) {
  return _mm256_set1_epi32(
      (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
}

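// Sum of squared errors between the source and the self-guided restoration
// output reconstructed from dat and the filter outputs flt0/flt1 with the
// integer projection coefficients xq[]. Each row is handled 16 pixels at a
// time with a scalar tail loop; the four branches cover the possible
// combinations of enabled filter outputs.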
int64_t av1_lowbd_pixel_proj_error_avx2(
    const uint8_t *src8, int width, int height, int src_stride,
    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
  int i, j, k;
  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
  const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
  __m256i sum64 = _mm256_setzero_si256();
  const uint8_t *src = src8;
  const uint8_t *dat = dat8;
  int64_t err = 0;
  if (params->r[0] > 0 && params->r[1] > 0) {
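    // Both filter outputs are used: with u = dat << SGRPROJ_RST_BITS,
    // v = xq[0] * (flt0 - u) + xq[1] * (flt1 - u).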
    __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]);
    for (i = 0; i < height; ++i) {
      __m256i sum32 = _mm256_setzero_si256();
      for (j = 0; j <= width - 16; j += 16) {
        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
        const __m256i flt0_16b = _mm256_permute4x64_epi64(
            _mm256_packs_epi32(yy_loadu_256(flt0 + j),
                               yy_loadu_256(flt0 + j + 8)),
            0xd8);
        const __m256i flt1_16b = _mm256_permute4x64_epi64(
            _mm256_packs_epi32(yy_loadu_256(flt1 + j),
                               yy_loadu_256(flt1 + j + 8)),
            0xd8);
        const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
        const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0);
        const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0);
        const __m256i v0 = _mm256_madd_epi16(
            xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
        const __m256i v1 = _mm256_madd_epi16(
            xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
        const __m256i vr0 =
            _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
        const __m256i vr1 =
            _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
        const __m256i e0 = _mm256_sub_epi16(
            _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
        const __m256i err0 = _mm256_madd_epi16(e0, e0);
        sum32 = _mm256_add_epi32(sum32, err0);
      }
      for (k = j; k < width; ++k) {
        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
        int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
        err += e * e;
      }
      dat += dat_stride;
      src += src_stride;
      flt0 += flt0_stride;
      flt1 += flt1_stride;
      const __m256i sum64_0 =
          _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
      const __m256i sum64_1 =
          _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
      sum64 = _mm256_add_epi64(sum64, sum64_0);
      sum64 = _mm256_add_epi64(sum64, sum64_1);
    }
  } else if (params->r[0] > 0) {
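    // Only flt0 is used; the second coefficient, -xq[0] << SGRPROJ_RST_BITS,
    // folds the -u term into the madd against the unshifted dat values.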
    __m256i xq_coeff =
        pair_set_epi16(xq[0], (-xq[0] * (1 << SGRPROJ_RST_BITS)));
    for (i = 0; i < height; ++i) {
      __m256i sum32 = _mm256_setzero_si256();
      for (j = 0; j <= width - 16; j += 16) {
        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
        const __m256i flt0_16b = _mm256_permute4x64_epi64(
            _mm256_packs_epi32(yy_loadu_256(flt0 + j),
                               yy_loadu_256(flt0 + j + 8)),
            0xd8);
        const __m256i v0 =
            _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt0_16b, d0));
        const __m256i v1 =
            _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt0_16b, d0));
        const __m256i vr0 =
            _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
        const __m256i vr1 =
            _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
        const __m256i e0 = _mm256_sub_epi16(
            _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
        const __m256i err0 = _mm256_madd_epi16(e0, e0);
        sum32 = _mm256_add_epi32(sum32, err0);
      }
      for (k = j; k < width; ++k) {
        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
        int32_t v = xq[0] * (flt0[k] - u);
        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
        err += e * e;
      }
      dat += dat_stride;
      src += src_stride;
      flt0 += flt0_stride;
      const __m256i sum64_0 =
          _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
      const __m256i sum64_1 =
          _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
      sum64 = _mm256_add_epi64(sum64, sum64_0);
      sum64 = _mm256_add_epi64(sum64, sum64_1);
    }
  } else if (params->r[1] > 0) {
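    // Only flt1 is used; same folding of the -u term as above, with xq[1].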
    __m256i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS));
    for (i = 0; i < height; ++i) {
      __m256i sum32 = _mm256_setzero_si256();
      for (j = 0; j <= width - 16; j += 16) {
        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
        const __m256i flt1_16b = _mm256_permute4x64_epi64(
            _mm256_packs_epi32(yy_loadu_256(flt1 + j),
                               yy_loadu_256(flt1 + j + 8)),
            0xd8);
        const __m256i v0 =
            _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt1_16b, d0));
        const __m256i v1 =
            _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt1_16b, d0));
        const __m256i vr0 =
            _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
        const __m256i vr1 =
            _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
        const __m256i e0 = _mm256_sub_epi16(
            _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
        const __m256i err0 = _mm256_madd_epi16(e0, e0);
        sum32 = _mm256_add_epi32(sum32, err0);
      }
      for (k = j; k < width; ++k) {
        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
        int32_t v = xq[1] * (flt1[k] - u);
        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
        err += e * e;
      }
      dat += dat_stride;
      src += src_stride;
      flt1 += flt1_stride;
      const __m256i sum64_0 =
          _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
      const __m256i sum64_1 =
          _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
      sum64 = _mm256_add_epi64(sum64, sum64_0);
      sum64 = _mm256_add_epi64(sum64, sum64_1);
    }
  } else {
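    // Neither filter output is used: plain sum of squared differences
    // between dat and src, accumulated in one 32-bit vector across all rows.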
    __m256i sum32 = _mm256_setzero_si256();
    for (i = 0; i < height; ++i) {
      for (j = 0; j <= width - 16; j += 16) {
        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
        const __m256i diff0 = _mm256_sub_epi16(d0, s0);
        const __m256i err0 = _mm256_madd_epi16(diff0, diff0);
        sum32 = _mm256_add_epi32(sum32, err0);
      }
      for (k = j; k < width; ++k) {
        const int32_t e = (int32_t)(dat[k]) - src[k];
        err += e * e;
      }
      dat += dat_stride;
      src += src_stride;
    }
    const __m256i sum64_0 =
        _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
    const __m256i sum64_1 =
        _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
    sum64 = _mm256_add_epi64(sum64_0, sum64_1);
  }
  int64_t sum[4];
  yy_storeu_256(sum, sum64);
  err += sum[0] + sum[1] + sum[2] + sum[3];
  return err;
}