/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <emmintrin.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
#include "av1/common/convolve.h"
void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_x,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_x_q4, const int subpel_y_q4,
                             ConvolveParams *conv_params) {
  const int bd = 8;

  DECLARE_ALIGNED(16, int16_t,
                  im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = MAX_SB_SIZE;
  int i, j;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

  const __m128i zero = _mm_setzero_si128();
  const int bits =
      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;

  assert(conv_params->round_0 > 0);

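  /* The 2D filter is applied in two passes: the horizontal pass filters the
   * (h + taps - 1) source rows needed by the vertical filter into the 16-bit
   * im_block buffer, and the vertical pass then filters im_block down to the
   * final 8-bit output. */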
  /* Horizontal filter */
  {
    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
        filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

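    // The (1 << (bd + FILTER_BITS - 1)) term is an offset that keeps the
    // rounded intermediate values non-negative so they survive the signed
    // 16-bit pack below; the vertical pass subtracts it again.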
    const __m128i round_const = _mm_set1_epi32(
        (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
    const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);

    for (i = 0; i < im_h; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i data =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);

        // Filter even-index pixels
        const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
        const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);

        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                         _mm_add_epi32(res_2, res_6));
        res_even =
            _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);

        // Filter odd-index pixels
        const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
        const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);

        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                        _mm_add_epi32(res_3, res_7));
        res_odd =
            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);

        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
        __m128i res = _mm_packs_epi32(res_even, res_odd);
        _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
      }
    }
  }
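
  // Note: each 8-pixel group in im_block is stored in the order
  // 0 2 4 6 1 3 5 7. The vertical pass filters entire registers, so the
  // order only matters at the final epi32 interleave, which restores the
  // natural pixel order.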
  /* Vertical filter */
  {
    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
        filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i sum_round =
        _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
    const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);

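    // round_const folds together the rounding bias for the final shift by
    // 'bits' and the removal of the offsets injected in both passes, so the
    // shifted result is an ordinary signed pixel value that the packs below
    // clamp to 8 bits.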
    const __m128i round_const = _mm_set1_epi32(
        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
        ((1 << (offset_bits - conv_params->round_1)) >> 1));
    const __m128i round_shift = _mm_cvtsi32_si128(bits);

    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        // Filter even-index pixels
        const int16_t *data = &im_block[i * im_stride + j];
        const __m128i src_0 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
                               *(__m128i *)(data + 1 * im_stride));
        const __m128i src_2 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
                               *(__m128i *)(data + 3 * im_stride));
        const __m128i src_4 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
                               *(__m128i *)(data + 5 * im_stride));
        const __m128i src_6 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
                               *(__m128i *)(data + 7 * im_stride));

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
                               *(__m128i *)(data + 1 * im_stride));
        const __m128i src_3 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
                               *(__m128i *)(data + 3 * im_stride));
        const __m128i src_5 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
                               *(__m128i *)(data + 5 * im_stride));
        const __m128i src_7 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
                               *(__m128i *)(data + 7 * im_stride));

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        __m128i res_lo_round =
            _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
        __m128i res_hi_round =
            _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);

        res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
                                     round_shift);
        res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
                                     round_shift);

        const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
        const __m128i res = _mm_packus_epi16(res16, res16);

        // Store values into the destination buffer
        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];

        if (w == 2) {
          *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res);
        } else if (w == 4) {
          *(uint32_t *)p = (uint32_t)_mm_cvtsi128_si32(res);
        } else {
          _mm_storel_epi64(p, res);
        }
      }
    }
  }
}

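// Copies one 128-pixel row. The loads are unaligned because the source is an
// arbitrarily aligned video buffer; the stores are aligned because the caller
// asserts a 16-byte-aligned destination for w >= 16.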
static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
  __m128i s[8];
  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16));
  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16));
  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16));
  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16));
  _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
  _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
  _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
  _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
  _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]);
  _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]);
  _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]);
  _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]);
}

void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
                                  const InterpFilterParams *filter_params_x,
                                  const InterpFilterParams *filter_params_y,
                                  const int subpel_x_q4, const int subpel_y_q4,
                                  ConvolveParams *conv_params) {
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;
  (void)conv_params;

  if (w >= 16) {
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 16));
  }
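
  // Each branch below copies two rows per iteration; the loops assume an
  // even h, which holds for all supported block sizes.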
  if (w == 2) {
    do {
      memcpy(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      memcpy(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 4) {
    do {
      memcpy(dst, src, 4 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      memcpy(dst, src, 4 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 8) {
    do {
      __m128i s[2];
      s[0] = _mm_loadl_epi64((__m128i *)src);
      src += src_stride;
      s[1] = _mm_loadl_epi64((__m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 16) {
    do {
      __m128i s[2];
      s[0] = _mm_loadu_si128((__m128i *)src);
      src += src_stride;
      s[1] = _mm_loadu_si128((__m128i *)src);
      src += src_stride;
      _mm_store_si128((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 32) {
    do {
      __m128i s[4];
      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
      src += src_stride;
      s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
      s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
      src += src_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
      _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 64) {
    do {
      __m128i s[8];
      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
      s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
      src += src_stride;
      s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
      s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
      s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
      s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
      src += src_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
      _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
      _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
      _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
      _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
      _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else {
    do {
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  }
}

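// Dist-weighted (jnt) compound copy: the 8-bit source is shifted left by
// 'bits' into the compound prediction domain and offset. On the second,
// do_average pass, those values are blended with the first prediction read
// from conv_params->dst via comp_avg(), rounded back down, and written to
// dst0 as 8-bit pixels.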
void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
                                   uint8_t *dst0, int dst_stride0, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_x_q4, const int subpel_y_q4,
                                   ConvolveParams *conv_params) {
  const int bd = 8;
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;

  const int bits =
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
  const int do_average = conv_params->do_average;
  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
  const __m128i zero = _mm_setzero_si128();
  const __m128i left_shift = _mm_cvtsi32_si128(bits);
  int i, j;

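  // Interleaving the forward and backward weights lets comp_avg() weight
  // both predictions at once with _mm_madd_epi16 when use_jnt_comp_avg is
  // set.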
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi16(w0);
  const __m128i wt1 = _mm_set1_epi16(w1);
  const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);

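  // offset_const biases the compound-domain values so they stay non-negative
  // in the uint16_t conv_params->dst buffer; convolve_rounding() subtracts
  // it again before the final rounding shift back to 8 bits.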
  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m128i offset_const = _mm_set1_epi16(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);

  assert((w % 4) == 0);

  if (!(w % 16)) {
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 16) {
        const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]);

        const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero);
        const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero);

        const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift);
        const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const);

        const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift);
        const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, offset_const);

        if (do_average) {
          const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j]));
          const __m128i data_ref_0_hi =
              _mm_loadu_si128((__m128i *)(&dst[j + 8]));

          const __m128i comp_avg_res_lo =
              comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_jnt_comp_avg);

          const __m128i round_result_lo = convolve_rounding(
              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);

          const __m128i comp_avg_res_hi =
              comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_jnt_comp_avg);

          const __m128i round_result_hi = convolve_rounding(
              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);

          const __m128i res_8 =
              _mm_packus_epi16(round_result_lo, round_result_hi);

          _mm_store_si128((__m128i *)(&dst0[j]), res_8);
        } else {
          _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo);
          _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi);
        }
      }
      src += src_stride;
      dst += dst_stride;
      dst0 += dst_stride0;
    }
  } else {
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
        const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);

        const __m128i res = _mm_sll_epi16(d16_0, left_shift);
        const __m128i res_unsigned = _mm_add_epi16(res, offset_const);

        if (do_average) {
          const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j]));

          const __m128i comp_avg_res =
              comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);

          const __m128i round_result = convolve_rounding(
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

          const __m128i res_8 = _mm_packus_epi16(round_result, round_result);

          if (w > 4)
            _mm_storel_epi64((__m128i *)(&dst0[j]), res_8);
          else
            *(uint32_t *)(&dst0[j]) = _mm_cvtsi128_si32(res_8);
        } else {
          _mm_store_si128((__m128i *)(&dst[j]), res_unsigned);
        }
      }
      src += src_stride;
      dst += dst_stride;
      dst0 += dst_stride0;
    }
  }
}