/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/synonyms.h"

void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_y,
                            const int subpel_y_qn) {
  int i, j, vert_tap = SUBPEL_TAPS;
  // The right shift is FILTER_BITS - 1 because the filter coefficients have
  // already been divided by 2.
  const int right_shift_bits = (FILTER_BITS - 1);
  __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
  __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1);
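  // Note: right_shift_const is (1 << right_shift_bits) >> 1, i.e. half of the
  // divisor, so (res + right_shift_const) >> right_shift_bits implements
  // round-to-nearest division by 1 << (FILTER_BITS - 1).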

  __m256i coeffs[6], s[12];
  __m128i d[10];

  // Select the effective number of vertical filter taps: the shorter 4- and
  // 6-tap kernels can be used when the outer filter coefficients are zero.
  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params_y, subpel_y_qn & SUBPEL_MASK);
  if (filter_params_y->taps == 12) {
    vert_tap = 12;
  } else if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
    vert_tap = 4;
  } else if (!(filter[0] | filter[7])) {
    vert_tap = 6;
  }
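  // Dropping zero taps shrinks both the multiply count and the number of
  // source rows that must be loaded: an 8-tap kernel whose outer taps are all
  // zero is numerically identical to the 4- or 6-tap evaluation below.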

  if (vert_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
  else if (vert_tap == 12) {
    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
  }

  // 4-tap vertical filter
  if (vert_tap == 4) {
    const int fo_vert = 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    for (j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));

      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
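      // With control byte 0x20, _mm256_permute2x128_si256 places the low 128
      // bits of its first operand in the lower lane and the low 128 bits of
      // its second operand in the upper lane, so each __m256i carries two
      // adjacent source rows and two output rows are produced per iteration.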

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
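      // Interleaving bytes from vertically adjacent rows pairs each pixel
      // with the one below it; convolve_lowbd_4tap (convolve_avx2.h) then
      // multiplies each byte pair against a pair of taps with
      // _mm256_maddubs_epi16, evaluating two taps per instruction.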

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
        const __m256i src_45a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
        // Round and shift by FILTER_BITS - 1.
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // Pack to 8 bits with unsigned saturation.
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);

          // Round and shift by FILTER_BITS - 1.
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // Pack to 8 bits with unsigned saturation.
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
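        // Slide the row window down by two rows so that only the two new
        // interleaved row pairs (s[2]/s[5]) need to be computed on the next
        // iteration.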
        s[0] = s[1];
        s[1] = s[2];

        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 6) {
    const int fo_vert = vert_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
      const __m256i src_34a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);

      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_45a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
        const __m256i src_56a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
            src6, 0x20);

        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);

        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);

        // Round and shift by FILTER_BITS - 1.
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // Pack to 8 bits with unsigned saturation.
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);

          // Round and shift by FILTER_BITS - 1.
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // Pack to 8 bits with unsigned saturation.
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[3] = s[4];
        s[4] = s[5];
      }
    }
  } else if (vert_tap == 12) {
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;
    const __m256i v_zero = _mm256_setzero_si256();
    right_shift = _mm_cvtsi32_si128(FILTER_BITS);
    right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1);
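    // The 12-tap path widens pixels to 16 bits and accumulates in 32 bits,
    // using full-precision coefficients from prepare_coeffs_12taps, so the
    // final shift is the full FILTER_BITS rather than FILTER_BITS - 1.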

    for (j = 0; j < w; j += 8) {
      const uint8_t *data = &src_ptr[j];
      __m256i src10;

      d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride));
      d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
      d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride));
      d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
      d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      const __m256i src_56a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20);

      const __m256i src_67a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20);

      const __m256i src_78a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20);

      const __m256i src_89a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20);

      src10 = _mm256_castsi128_si256(
          _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)));
      const __m256i src_910a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20);

      const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero);
      const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero);
      const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero);
      const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero);
      const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero);
      const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero);
      const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero);
      const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero);
      const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero);
      const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero);

      s[0] = _mm256_unpacklo_epi16(src_01, src_12);
      s[1] = _mm256_unpacklo_epi16(src_23, src_34);
      s[2] = _mm256_unpacklo_epi16(src_45, src_56);
      s[3] = _mm256_unpacklo_epi16(src_67, src_78);
      s[4] = _mm256_unpacklo_epi16(src_89, src_910);

      s[6] = _mm256_unpackhi_epi16(src_01, src_12);
      s[7] = _mm256_unpackhi_epi16(src_23, src_34);
      s[8] = _mm256_unpackhi_epi16(src_45, src_56);
      s[9] = _mm256_unpackhi_epi16(src_67, src_78);
      s[10] = _mm256_unpackhi_epi16(src_89, src_910);
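      // Zero-extending each byte to 16 bits and then interleaving adjacent
      // rows pairs each pixel with the one below it as 16-bit lanes, ready
      // for the _mm256_madd_epi16 tap pairs inside convolve_12taps.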

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_1011a = _mm256_permute2x128_si256(
            src10,
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            0x20);

        src10 = _mm256_castsi128_si256(
            _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)));

        const __m256i src_1112a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))),
            src10, 0x20);

        const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero);
        const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero);

        s[5] = _mm256_unpacklo_epi16(src_1011, src_1112);
        s[11] = _mm256_unpackhi_epi16(src_1011, src_1112);

        const __m256i res_lo = convolve_12taps(s, coeffs);

        const __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, right_shift_const), right_shift);
        // Pack to 16 bits with signed saturation, then to 8 bits with
        // unsigned saturation.
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 4) {
          const __m256i res_hi = convolve_12taps(s + 6, coeffs);

          const __m256i res_32b_hi = _mm256_sra_epi32(
              _mm256_add_epi32(res_hi, right_shift_const), right_shift);
          // Pack to 16 bits with signed saturation, then to 8 bits with
          // unsigned saturation.
          __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi);
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_extracti128_si256(res_a, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 2) {
            *(uint32_t *)&dst[i * dst_stride + j] =
                (uint32_t)_mm_cvtsi128_si32(res_0);
            *(uint32_t *)&dst[i * dst_stride + j + dst_stride] =
                (uint32_t)_mm_cvtsi128_si32(res_1);
          } else {
            *(uint16_t *)&dst[i * dst_stride + j] =
                (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)&dst[i * dst_stride + j + dst_stride] =
                (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];
        s[3] = s[4];
        s[4] = s[5];

        s[6] = s[7];
        s[7] = s[8];
        s[8] = s[9];
        s[9] = s[10];
        s[10] = s[11];
      }
    }
  } else {
    const int fo_vert = filter_params_y->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_vert * src_stride;

    for (j = 0; j < w; j += 16) {
      const uint8_t *data = &src_ptr[j];
      __m256i src6;

      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
      // Load lines a and b. Line a to lower 128, line b to upper 128
      const __m256i src_01a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);

      const __m256i src_12a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);

      const __m256i src_23a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);

      const __m256i src_34a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);

      const __m256i src_45a = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);

      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      const __m256i src_56a =
          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);

      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);

      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
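      // Default 8-tap path: the same two-row interleave as above, but with
      // four byte-pair windows (s[0..3] low halves, s[4..7] high halves)
      // feeding the four _mm256_maddubs_epi16 tap pairs in convolve_lowbd.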

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];
        const __m256i src_67a = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
        const __m256i src_78a = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);

        const __m256i res_lo = convolve_lowbd(s, coeffs);

        // Round and shift by FILTER_BITS - 1.
        const __m256i res_16b_lo = _mm256_sra_epi16(
            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
        // Pack to 8 bits with unsigned saturation.
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        if (w - j > 8) {
          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);

          // Round and shift by FILTER_BITS - 1.
          const __m256i res_16b_hi = _mm256_sra_epi16(
              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
          // Pack to 8 bits with unsigned saturation.
          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);

          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);

          const __m128i res_0 = _mm256_castsi256_si128(res_a);
          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           res_1);
        } else {
          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          if (w - j > 4) {
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                             res_1);
          } else if (w - j > 2) {
            xx_storel_32(&dst[i * dst_stride + j], res_0);
            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
          } else {
            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
            __m128i *const p_1 =
                (__m128i *)&dst[i * dst_stride + j + dst_stride];
            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}

void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilterParams *filter_params_x,
                            const int subpel_x_qn,
                            ConvolveParams *conv_params) {
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_shift = _mm_cvtsi32_si128(bits);
  __m256i round_0_const =
      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
  __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
  __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
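  // Two-stage rounding: first by round_0 - 1 bits (the coefficients are
  // pre-halved in the low-bitdepth paths), then by the remaining
  // bits = FILTER_BITS - round_0, for a total of FILTER_BITS - 1.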
  int i, horiz_tap = SUBPEL_TAPS;

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
  assert(conv_params->round_0 > 0);

  __m256i coeffs[6], filt[4];
  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
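  // filt_global_avx2 holds the _mm256_shuffle_epi8 masks used by the
  // convolve_lowbd_x helpers to build sliding byte-pair windows of the
  // source row, one mask per pair of filter taps.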

  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
      filter_params_x, subpel_x_qn & SUBPEL_MASK);
  if (filter_params_x->taps == 12) {
    horiz_tap = 12;
  } else if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
    horiz_tap = 4;
  } else if (!(filter[0] | filter[7])) {
    horiz_tap = 6;
  }

  if (horiz_tap == 6)
    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs);
  else if (horiz_tap == 12) {
    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs);
  } else {
    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs);
  }
  // 4-tap horizontal filter
  if (horiz_tap == 4) {
    const int fo_horiz = 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        // Pack to 8 bits with unsigned saturation.
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);

        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          // Pack to 8 bits with unsigned saturation.
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
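          // 216 = 0xD8 = 0b11011000 selects 64-bit lanes 0, 2, 1, 3, so the
          // packed results of the two 128-bit lanes land contiguously in the
          // low 128 bits before the store.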
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else if (horiz_tap == 6) {
    const int fo_horiz = horiz_tap / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        // Pack to 8 bits with unsigned saturation.
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          // Pack to 8 bits with unsigned saturation.
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  } else if (horiz_tap == 12) {
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    const __m256i v_zero = _mm256_setzero_si256();
    round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1);
    round_const = _mm256_set1_epi32((1 << bits) >> 1);
    round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
    __m256i s[6];
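    // As in the vertical 12-tap path, the coefficients are kept at full
    // precision here, so the first stage shifts by the full round_0 and
    // accumulation is done in 32 bits.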
    if (w <= 4) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);
        // row0 0..7 row1 0..7
        const __m256i s_16l = _mm256_unpacklo_epi8(data, v_zero);
        // row0 8..F row1 8..F
        const __m256i s_16h = _mm256_unpackhi_epi8(data, v_zero);

        // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03
        const __m256i s_ll = _mm256_unpacklo_epi16(s_16l, s_16l);
        // row0 04 04 .. 07 07 row1 04 04 .. 07 07
        const __m256i s_lh = _mm256_unpackhi_epi16(s_16l, s_16l);

        // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B
        const __m256i s_hl = _mm256_unpacklo_epi16(s_16h, s_16h);
        // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F
        const __m256i s_hh = _mm256_unpackhi_epi16(s_16h, s_16h);

        // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14
        s[0] = _mm256_alignr_epi8(s_lh, s_ll, 2);
        // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
        s[1] = _mm256_alignr_epi8(s_lh, s_ll, 10);
        // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18
        s[2] = _mm256_alignr_epi8(s_hl, s_lh, 2);
        // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A
        s[3] = _mm256_alignr_epi8(s_hl, s_lh, 10);
        // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C
        s[4] = _mm256_alignr_epi8(s_hh, s_hl, 2);
        // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E
        s[5] = _mm256_alignr_epi8(s_hh, s_hl, 10);
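        // Duplicating each 16-bit pixel and then realigning by 2 and 10
        // bytes yields consecutive-pixel pairs at each tap offset, so
        // convolve_12taps can evaluate two taps per _mm256_madd_epi16.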

        const __m256i res_lo = convolve_12taps(s, coeffs);

        __m256i res_32b_lo = _mm256_sra_epi32(
            _mm256_add_epi32(res_lo, round_0_const), round_0_shift);

        // 00 01 02 03 10 11 12 13
        res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const),
                                      round_shift);
        // Pack to 16 bits with signed saturation, then to 8 bits with
        // unsigned saturation.
        // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13
        __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);

        // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03
        const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
        // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13
        const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
        if (w > 2) {
          // 00 01 02 03
          *(uint32_t *)&dst[i * dst_stride] =
              (uint32_t)_mm_cvtsi128_si32(res_0);
          // 10 11 12 13
          *(uint32_t *)&dst[i * dst_stride + dst_stride] =
              (uint32_t)_mm_cvtsi128_si32(res_1);
        } else {
          // 00 01
          *(uint16_t *)&dst[i * dst_stride] =
              (uint16_t)_mm_cvtsi128_si32(res_0);
          // 10 11
          *(uint16_t *)&dst[i * dst_stride + dst_stride] =
              (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; i++) {
        for (int j = 0; j < w; j += 8) {
          const __m256i data = _mm256_permute2x128_si256(
              _mm256_castsi128_si256(
                  _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))),
              _mm256_castsi128_si256(_mm_loadu_si128(
                  (__m128i *)(&src_ptr[i * src_stride + j + 4]))),
              0x20);
          // row0 0..7 4..B
          const __m256i s_16l = _mm256_unpacklo_epi8(data, v_zero);
          // row0 8..F C..13
          const __m256i s_16h = _mm256_unpackhi_epi8(data, v_zero);

          // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07
          const __m256i s_ll = _mm256_unpacklo_epi16(s_16l, s_16l);
          // row0 04 04 .. 07 07 08 08 .. 0B 0B
          const __m256i s_lh = _mm256_unpackhi_epi16(s_16l, s_16l);

          // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F
          const __m256i s_hl = _mm256_unpacklo_epi16(s_16h, s_16h);
          // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13
          const __m256i s_hh = _mm256_unpackhi_epi16(s_16h, s_16h);

          s[0] = _mm256_alignr_epi8(s_lh, s_ll, 2);
          s[1] = _mm256_alignr_epi8(s_lh, s_ll, 10);
          s[2] = _mm256_alignr_epi8(s_hl, s_lh, 2);
          s[3] = _mm256_alignr_epi8(s_hl, s_lh, 10);
          s[4] = _mm256_alignr_epi8(s_hh, s_hl, 2);
          s[5] = _mm256_alignr_epi8(s_hh, s_hl, 10);

          const __m256i res_lo = convolve_12taps(s, coeffs);

          __m256i res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_lo, round_0_const), round_0_shift);

          res_32b_lo = _mm256_sra_epi32(
              _mm256_add_epi32(res_32b_lo, round_const), round_shift);
          // Pack to 16 bits with signed saturation, then to 8 bits with
          // unsigned saturation.
          __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo);
          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
          const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0);
          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
          *(uint32_t *)&dst[i * dst_stride + j] =
              (uint32_t)_mm_cvtsi128_si32(res_0);
          *(uint32_t *)&dst[i * dst_stride + j + 4] =
              (uint32_t)_mm_cvtsi128_si32(res_1);
        }
      }
    }
  } else {
    const int fo_horiz = filter_params_x->taps / 2 - 1;
    const uint8_t *const src_ptr = src - fo_horiz;
    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    if (w <= 8) {
      for (i = 0; i < h; i += 2) {
        const __m256i data = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
            _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
            0x20);

        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                   round_0_shift);

        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                   round_shift);

        // Pack to 8 bits with unsigned saturation.
        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

        const __m128i res_0 = _mm256_castsi256_si128(res_8b);
        const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
        if (w > 4) {
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
        } else if (w > 2) {
          xx_storel_32(&dst[i * dst_stride], res_0);
          xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
        } else {
          __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
          __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
          *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
          *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
        }
      }
    } else {
      for (i = 0; i < h; ++i) {
        for (int j = 0; j < w; j += 16) {
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
          // 18 19 20 21 22 23
          const __m256i data = _mm256_inserti128_si256(
              _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
              _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
              1);

          __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
                                     round_0_shift);

          res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
                                     round_shift);

          // Pack to 8 bits with unsigned saturation.
          __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);

          // Store values into the destination buffer
          // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
          res_8b = _mm256_permute4x64_epi64(res_8b, 216);
          __m128i res = _mm256_castsi256_si128(res_8b);
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
        }
      }
    }
  }
}