1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
12 #define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
13 
14 #include <arm_neon.h>
15 
16 #include "./vpx_config.h"
17 #include "./vpx_dsp_rtcd.h"
18 
// Loads four rows of eight bytes each. Row k is read from s + k * p and
// written through the k-th output pointer (s0..s3), in ascending row order.
static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s + 0 * p);
  *s1 = vld1_u8(s + 1 * p);
  *s2 = vld1_u8(s + 2 * p);
  *s3 = vld1_u8(s + 3 * p);
}
30 
// Loads eight rows of eight bytes each. Row k is read from s + k * p and
// written through the k-th output pointer (s0..s7), in ascending row order.
static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s + 0 * p);
  *s1 = vld1_u8(s + 1 * p);
  *s2 = vld1_u8(s + 2 * p);
  *s3 = vld1_u8(s + 3 * p);
  *s4 = vld1_u8(s + 4 * p);
  *s5 = vld1_u8(s + 5 * p);
  *s6 = vld1_u8(s + 6 * p);
  *s7 = vld1_u8(s + 7 * p);
}
52 
// Loads eight rows of sixteen bytes each. Row k is read from s + k * p and
// written through the k-th output pointer (s0..s7), in ascending row order.
static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s + 0 * p);
  *s1 = vld1q_u8(s + 1 * p);
  *s2 = vld1q_u8(s + 2 * p);
  *s3 = vld1q_u8(s + 3 * p);
  *s4 = vld1q_u8(s + 4 * p);
  *s5 = vld1q_u8(s + 5 * p);
  *s6 = vld1q_u8(s + 6 * p);
  *s7 = vld1q_u8(s + 7 * p);
}
74 
// Weighted sum of eight int16x4_t inputs.
//
// s0, s1, s2, s5, s6 and s7 are scaled by lanes 0, 1, 2, 5, 6 and 7 of
// `filters` and combined with non-saturating multiply-accumulates. The
// products s3 * filter3 and s4 * filter4 are then folded in with saturating
// adds, in that order. The 16-bit sum is returned without any shift or
// narrowing.
//
// NOTE(review): filter3 / filter4 are presumably lanes 3 and 4 of `filters`
// broadcast across the vector -- confirm against callers.
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                    const int16x4_t s2, const int16x4_t s3,
                                    const int16x4_t s4, const int16x4_t s5,
                                    const int16x4_t s6, const int16x4_t s7,
                                    const int16x8_t filters,
                                    const int16x4_t filter3,
                                    const int16x4_t filter4) {
  const int16x4_t taps_lo = vget_low_s16(filters);   // filters lanes 0..3
  const int16x4_t taps_hi = vget_high_s16(filters);  // filters lanes 4..7
  const int16x4_t mid3 = vmul_s16(s3, filter3);
  const int16x4_t mid4 = vmul_s16(s4, filter4);
  int16x4_t acc = vmul_lane_s16(s0, taps_lo, 0);

  acc = vmla_lane_s16(acc, s1, taps_lo, 1);
  acc = vmla_lane_s16(acc, s2, taps_lo, 2);
  acc = vmla_lane_s16(acc, s5, taps_hi, 1);
  acc = vmla_lane_s16(acc, s6, taps_hi, 2);
  acc = vmla_lane_s16(acc, s7, taps_hi, 3);

  // Saturating adds are order-sensitive: the s3 term goes in before s4.
  acc = vqadd_s16(acc, mid3);
  return vqadd_s16(acc, mid4);
}
96 
// Weighted sum of eight int16x8_t inputs, reduced to unsigned 8-bit.
//
// s0, s1, s2, s5, s6 and s7 are scaled by lanes 0, 1, 2, 5, 6 and 7 of
// `filters` and combined with non-saturating multiply-accumulates. The
// products s3 * filter3 and s4 * filter4 are then folded in with saturating
// adds, in that order. The 16-bit sum is finally shifted right by 7 with
// rounding and narrowed to uint8 with unsigned saturation.
//
// In this file scale_filter_8() passes lanes 3 and 4 of `filters`, each
// broadcast to a full vector, as filter3 and filter4.
static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
                                    const int16x8_t s2, const int16x8_t s3,
                                    const int16x8_t s4, const int16x8_t s5,
                                    const int16x8_t s6, const int16x8_t s7,
                                    const int16x8_t filters,
                                    const int16x8_t filter3,
                                    const int16x8_t filter4) {
  const int16x4_t taps_lo = vget_low_s16(filters);   // filters lanes 0..3
  const int16x4_t taps_hi = vget_high_s16(filters);  // filters lanes 4..7
  const int16x8_t mid3 = vmulq_s16(s3, filter3);
  const int16x8_t mid4 = vmulq_s16(s4, filter4);
  int16x8_t acc = vmulq_lane_s16(s0, taps_lo, 0);

  acc = vmlaq_lane_s16(acc, s1, taps_lo, 1);
  acc = vmlaq_lane_s16(acc, s2, taps_lo, 2);
  acc = vmlaq_lane_s16(acc, s5, taps_hi, 1);
  acc = vmlaq_lane_s16(acc, s6, taps_hi, 2);
  acc = vmlaq_lane_s16(acc, s7, taps_hi, 3);

  // Saturating adds are order-sensitive: the s3 term goes in before s4.
  acc = vqaddq_s16(acc, mid3);
  acc = vqaddq_s16(acc, mid4);

  return vqrshrun_n_s16(acc, 7);
}
118 
// Widens s[0]..s[7] from uint8x8_t to int16x8_t and hands them to
// convolve8_8() together with `filters` and with lanes 3 and 4 of `filters`
// each broadcast to a full vector. Returns convolve8_8()'s result.
// `s` must point to at least eight vectors.
static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
                                       const int16x8_t filters) {
  const int16x8_t tap3 = vdupq_lane_s16(vget_low_s16(filters), 3);
  const int16x8_t tap4 = vdupq_lane_s16(vget_high_s16(filters), 0);
  int16x8_t wide[8];
  int i;

  // Zero-extend each byte to 16 bits, then view the lanes as signed.
  for (i = 0; i < 8; ++i) {
    wide[i] = vreinterpretq_s16_u16(vmovl_u8(s[i]));
  }

  return convolve8_8(wide[0], wide[1], wide[2], wide[3], wide[4], wide[5],
                     wide[6], wide[7], filters, tap3, tap4);
}
137 
138 #endif  // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
139