1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <arm_neon.h>
13 
14 #include "aom_dsp/inv_txfm.h"
15 #include "aom_ports/mem.h"
16 
// Adds a single constant residual to every pixel of an 8x8 block in place.
//
// Only input[0] is read. It is scaled by cospi_16_64 and passed through
// dct_const_round_shift() twice, then rounded with ROUND_POWER_OF_TWO(.., 5).
// The resulting value is added to each of the 8 pixels in each of the 8 rows
// starting at dest, with the sums saturated to the uint8_t range.
//
// input:       coefficient buffer; only element 0 is used.
// dest:        top-left pixel of the 8x8 block; read and overwritten.
// dest_stride: distance in bytes between the starts of consecutive rows.
void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8x8_t row0, row1, row2, row3;
  uint16x8_t sum0, sum1, sum2, sum3;
  uint16x8_t dc;
  uint8_t *src = dest;
  uint8_t *dst = dest;
  int i;
  int16_t a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 5);

  // Broadcast the residual to all 8 lanes. It is kept as unsigned 16-bit so
  // that vaddw_u8 can be used: the addition wraps modulo 2^16, so viewing the
  // sum as signed afterwards yields a1 + pixel even when a1 is negative.
  dc = vreinterpretq_u16_s16(vdupq_n_s16(a1));

  // Two passes of four rows each cover the eight rows of the block.
  for (i = 0; i < 2; i++) {
    // Load rows with the byte-typed intrinsic: dest is a uint8_t pointer with
    // no stated alignment, so it is not cast to a wider element type.
    row0 = vld1_u8(src);
    src += dest_stride;
    row1 = vld1_u8(src);
    src += dest_stride;
    row2 = vld1_u8(src);
    src += dest_stride;
    row3 = vld1_u8(src);
    src += dest_stride;

    // Widen each pixel to 16 bits and add the residual.
    sum0 = vaddw_u8(dc, row0);
    sum1 = vaddw_u8(dc, row1);
    sum2 = vaddw_u8(dc, row2);
    sum3 = vaddw_u8(dc, row3);

    // Reinterpret as signed and narrow with unsigned saturation, which clamps
    // each result to [0, 255].
    row0 = vqmovun_s16(vreinterpretq_s16_u16(sum0));
    row1 = vqmovun_s16(vreinterpretq_s16_u16(sum1));
    row2 = vqmovun_s16(vreinterpretq_s16_u16(sum2));
    row3 = vqmovun_s16(vreinterpretq_s16_u16(sum3));

    // Write the four reconstructed rows back over the rows that were read.
    vst1_u8(dst, row0);
    dst += dest_stride;
    vst1_u8(dst, row1);
    dst += dest_stride;
    vst1_u8(dst, row2);
    dst += dest_stride;
    vst1_u8(dst, row3);
    dst += dest_stride;
  }
}
63