1 /***********************************************************************
2 Copyright (c) 2017 Google Inc.
3 Redistribution and use in source and binary forms, with or without
4 modification, are permitted provided that the following conditions
5 are met:
6 - Redistributions of source code must retain the above copyright notice,
7 this list of conditions and the following disclaimer.
8 - Redistributions in binary form must reproduce the above copyright
9 notice, this list of conditions and the following disclaimer in the
10 documentation and/or other materials provided with the distribution.
11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
12 names of specific contributors, may be used to endorse or promote
13 products derived from this software without specific prior written
14 permission.
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 POSSIBILITY OF SUCH DAMAGE.
26 ***********************************************************************/
27 
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 
32 #include <arm_neon.h>
33 #ifdef OPUS_CHECK_ASM
34 # include <string.h>
35 #endif
36 #include "main.h"
37 #include "stack_alloc.h"
38 
39 /* NEON intrinsics optimization now can only parallelize up to 4 delay decision states.    */
40 /* If there are more states, C function is called, and this optimization must be expanded. */
41 #define NEON_MAX_DEL_DEC_STATES 4
42 
/* Delayed-decision states for up to NEON_MAX_DEL_DEC_STATES states, stored    */
/* transposed (struct-of-arrays) relative to the scalar C implementation: the  */
/* last dimension is the decision-state index, so one 128-bit NEON vector can  */
/* load/store the same entry of all 4 states at once.                          */
typedef struct {
    opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ][ NEON_MAX_DEL_DEC_STATES ]; /* Short-term LPC filter state, Q14 */
    opus_int32 RandState[ DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* Pseudo-random state history (circular, DECISION_DELAY deep) */
    opus_int32 Q_Q10[     DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* Quantized pulse history, Q10                                */
    opus_int32 Xq_Q14[    DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* Quantized output signal history, Q14                        */
    opus_int32 Pred_Q15[  DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* Prediction history, Q15                                     */
    opus_int32 Shape_Q14[ DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* Noise-shaping history, Q14                                  */
    opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ][ NEON_MAX_DEL_DEC_STATES ]; /* Noise-shaping AR filter state, Q14                          */
    opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ]; /* Low-frequency shaping AR state, Q14                    */
    opus_int32 Diff_Q14[  NEON_MAX_DEL_DEC_STATES ]; /* Difference signal state, Q14                           */
    opus_int32 Seed[      NEON_MAX_DEL_DEC_STATES ]; /* Current random seed, one per decision state            */
    opus_int32 SeedInit[  NEON_MAX_DEL_DEC_STATES ]; /* Seed value at start of frame, per decision state       */
    opus_int32 RD_Q10[    NEON_MAX_DEL_DEC_STATES ]; /* Accumulated rate/distortion cost per state, Q10        */
} NSQ_del_decs_struct;
57 
/* Per-sample candidate values for all NEON_MAX_DEL_DEC_STATES delayed-decision */
/* states, also stored transposed (one array entry per decision state) so each  */
/* member maps onto a single 128-bit NEON vector.                               */
typedef struct {
    opus_int32 Q_Q10[        NEON_MAX_DEL_DEC_STATES ]; /* Quantized pulse, Q10                    */
    opus_int32 RD_Q10[       NEON_MAX_DEL_DEC_STATES ]; /* Rate/distortion cost, Q10               */
    opus_int32 xq_Q14[       NEON_MAX_DEL_DEC_STATES ]; /* Quantized output sample, Q14            */
    opus_int32 LF_AR_Q14[    NEON_MAX_DEL_DEC_STATES ]; /* Low-frequency shaping AR state, Q14     */
    opus_int32 Diff_Q14[     NEON_MAX_DEL_DEC_STATES ]; /* Difference signal, Q14                  */
    opus_int32 sLTP_shp_Q14[ NEON_MAX_DEL_DEC_STATES ]; /* LTP shaping filter sample, Q14          */
    opus_int32 LPC_exc_Q14[  NEON_MAX_DEL_DEC_STATES ]; /* LPC excitation sample, Q14              */
} NSQ_samples_struct;
67 
/* Forward declaration: rescales the input, LTP state and all delayed-decision */
/* states with 1/gain when the quantization gain changes between subframes.    */
/* (static, so the definition appears later in this translation unit.)         */
static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
    NSQ_del_decs_struct psDelDec[],                 /* I/O  Delayed decision states             */
    const opus_int16    x16[],                      /* I    Input                               */
    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
    opus_int            subfr,                      /* I    Subframe number                     */
    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
    const opus_int      signal_type,                /* I    Signal type                         */
    const opus_int      decisionDelay               /* I    Decision delay                      */
);
83 
84 /******************************************/
85 /* Noise shape quantizer for one subframe */
86 /******************************************/
/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
/* Forward declaration of the per-subframe delayed-decision noise shape        */
/* quantizer (static, defined later in this translation unit).                 */
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
    NSQ_del_decs_struct psDelDec[],             /* I/O  Delayed decision states             */
    opus_int            signalType,             /* I    Signal type                         */
    const opus_int32    x_Q10[],                /* I                                        */
    opus_int8           pulses[],               /* O                                        */
    opus_int16          xq[],                   /* O                                        */
    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
    opus_int            lag,                    /* I    Pitch lag                           */
    opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
    opus_int32          LF_shp_Q14,             /* I                                        */
    opus_int32          Gain_Q16,               /* I                                        */
    opus_int            Lambda_Q10,             /* I                                        */
    opus_int            offset_Q10,             /* I                                        */
    opus_int            length,                 /* I    Input length                        */
    opus_int            subfr,                  /* I    Subframe number                     */
    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
    opus_int            warping_Q16,            /* I                                        */
    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
    opus_int            *smpl_buf_idx,          /* I/O  Index to newest samples in buffers  */
    opus_int            decisionDelay           /* I                                        */
);
115 
/* Copies 8 consecutive samples of the winning delayed-decision state          */
/* (walking backwards from last_smple_idx) into the pulse output, the          */
/* quantized output signal, and the long-term shaping filter state.            */
/* Caller must guarantee last_smple_idx - 7 >= 0 (no circular-buffer wrap      */
/* inside one call).                                                           */
static OPUS_INLINE void copy_winner_state_kernel(
    const NSQ_del_decs_struct *psDelDec,       /* I    Delayed decision states (transposed layout)        */
    const opus_int            offset,          /* I    Offset into pulses[] and pxq[]                     */
    const opus_int            last_smple_idx,  /* I    Index of newest sample to copy (>= 7)              */
    const opus_int            Winner_ind,      /* I    Lane index of the winning decision state           */
    const int32x2_t           gain_lo_s32x2,   /* I    Low 16 bits of gain, pre-shifted (see caller)      */
    const int32x2_t           gain_hi_s32x2,   /* I    High 16 bits of gain                               */
    const int32x4_t           shift_s32x4,     /* I    Negated shift amount for vrshlq (right shift)      */
    int32x4_t                 t0_s32x4,        /* I    Pre-initialized scratch vector                     */
    int32x4_t                 t1_s32x4,        /* I    Pre-initialized scratch vector                     */
    opus_int8 *const          pulses,          /* O    Quantized pulse signal                             */
    opus_int16                *pxq,            /* O    Quantized output signal                            */
    silk_nsq_state            *NSQ             /* I/O  NSQ state (sLTP_shp_Q14 updated)                   */
)
{
    int16x8_t t_s16x8;
    int32x4_t o0_s32x4, o1_s32x4;

    /* Gather 8 Q_Q10 entries of the winner lane (newest first), then round-   */
    /* shift by 10 and narrow to int8 to produce the pulses.                   */
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    t_s16x8 = vcombine_s16( vrshrn_n_s32( t0_s32x4, 10 ), vrshrn_n_s32( t1_s32x4, 10 ) );
    vst1_s8( &pulses[ offset ], vmovn_s16( t_s16x8 ) );

    /* Gather 8 Xq_Q14 entries and apply the gain: the multiply is split into  */
    /* low/high 16-bit halves to reproduce silk_SMULWW() (see scalar loop in   */
    /* copy_winner_state()), then round-shifted and saturated to 16 bits.      */
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    o0_s32x4 = vqdmulhq_lane_s32( t0_s32x4, gain_lo_s32x2, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( t1_s32x4, gain_lo_s32x2, 0 );
    o0_s32x4 = vmlaq_lane_s32( o0_s32x4, t0_s32x4, gain_hi_s32x2, 0 );
    o1_s32x4 = vmlaq_lane_s32( o1_s32x4, t1_s32x4, gain_hi_s32x2, 0 );
    o0_s32x4 = vrshlq_s32( o0_s32x4, shift_s32x4 );  /* negative shift: rounding right shift */
    o1_s32x4 = vrshlq_s32( o1_s32x4, shift_s32x4 );
    vst1_s16( &pxq[ offset + 0 ], vqmovn_s32( o0_s32x4 ) );
    vst1_s16( &pxq[ offset + 4 ], vqmovn_s32( o1_s32x4 ) );

    /* Gather 8 Shape_Q14 entries and copy them into the LTP shaping state. */
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 0 ], t0_s32x4 );
    vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 4 ], t1_s32x4 );
}
173 
/* Copies the final decisionDelay samples of the winning delayed-decision      */
/* state into the output buffers (pulses, pxq) and the long-term shaping       */
/* filter state. Samples are copied 8 at a time with the NEON kernel where     */
/* possible; scalar loops handle the remainder and the wrap-around of the      */
/* circular DECISION_DELAY-deep history buffers.                               */
static OPUS_INLINE void copy_winner_state(
    const NSQ_del_decs_struct *psDelDec,       /* I    Delayed decision states                     */
    const opus_int            decisionDelay,   /* I    Number of samples to copy                   */
    const opus_int            smpl_buf_idx,    /* I    Index of oldest samples in circular buffers */
    const opus_int            Winner_ind,      /* I    Lane index of winning decision state        */
    const opus_int32          gain,            /* I    Gain applied to Xq_Q14 (SMULWW semantics)   */
    const opus_int32          shift,           /* I    Rounding right-shift after the gain multiply*/
    opus_int8 *const          pulses,          /* O    Quantized pulse signal                      */
    opus_int16                *pxq,            /* O    Quantized output signal                     */
    silk_nsq_state            *NSQ             /* I/O  NSQ state                                   */
)
{
    opus_int        i, last_smple_idx;
    /* Split the gain for the NEON equivalent of silk_SMULWW(): the low 16     */
    /* bits are pre-shifted left by 15 so that vqdmulhq yields (x*lo)>>16.     */
    const int32x2_t gain_lo_s32x2 = vdup_n_s32( silk_LSHIFT32( gain & 0x0000FFFF, 15 ) );
    const int32x2_t gain_hi_s32x2 = vdup_n_s32( gain >> 16 );
    const int32x4_t shift_s32x4 = vdupq_n_s32( -shift ); /* negated: vrshlq then shifts right */
    int32x4_t       t0_s32x4, t1_s32x4;

    t0_s32x4 = t1_s32x4 = vdupq_n_s32( 0 ); /* initialization */
    /* Index of the newest sample to copy; the sum can exceed DECISION_DELAY   */
    /* twice over, hence the two conditional wrap-arounds.                     */
    last_smple_idx = smpl_buf_idx + decisionDelay - 1 + DECISION_DELAY;
    if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
    if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;

    /* Vectorized copy, 8 samples per iteration, while no wrap-around occurs. */
    for( i = 0; ( i < ( decisionDelay - 7 ) ) && ( last_smple_idx >= 7 ); i += 8, last_smple_idx -= 8 ) {
        copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
    }
    /* Scalar copy down to the bottom of the circular buffer. */
    for( ; ( i < decisionDelay ) && ( last_smple_idx >= 0 ); i++, last_smple_idx-- ) {
        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
    }

    /* Wrap to the top of the circular buffer and copy the remaining samples.  */
    /* Bug fix: the vectorized loop must advance by 8 samples per kernel call  */
    /* (matching the first vectorized loop above); advancing by 1 produced     */
    /* overlapping, incorrect copies.                                          */
    last_smple_idx += DECISION_DELAY;
    for( ; i < ( decisionDelay - 7 ); i += 8, last_smple_idx -= 8 ) {
        copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
    }
    for( ; i < decisionDelay; i++, last_smple_idx-- ) {
        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
    }
}
216 
/* Delayed-decision noise shape quantizer, NEON-optimized entry point.         */
/* Falls back to the C implementation unless 3 or 4 delayed-decision states    */
/* are requested (the vector width this optimization is built around).         */
void silk_NSQ_del_dec_neon(
    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
    const opus_int16            x16[],                                      /* I    Input                           */
    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */
    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
)
{
#ifdef OPUS_CHECK_ASM
    /* Run the reference C implementation on copies of all in/out state so the */
    /* NEON path can be verified bit-exactly at the end of this function.      */
    silk_nsq_state NSQ_c;
    SideInfoIndices psIndices_c;
    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
    const opus_int8 *const pulses_a = pulses;

    ( void )pulses_a;
    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
    silk_memcpy( pulses_c, pulses, sizeof( pulses_c ) );
    silk_NSQ_del_dec_c( psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
                       pitchL, Lambda_Q10, LTP_scale_Q14 );
#endif

    /* The optimization parallelizes the different delay decision states. */
    if(( psEncC->nStatesDelayedDecision > NEON_MAX_DEL_DEC_STATES ) || ( psEncC->nStatesDelayedDecision <= 2 )) {
        /* NEON intrinsics optimization now can only parallelize up to 4 delay decision states.    */
        /* If there are more states, C function is called, and this optimization must be expanded. */
        /* When the number of delay decision states is less than 3, there are penalties using this */
        /* optimization, and C function is called.                                                 */
        /* When the number of delay decision states is 2, it's better to specialize another        */
        /* structure NSQ_del_dec2_struct and optimize with shorter NEON registers. (Low priority)  */
        silk_NSQ_del_dec_c( psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14,
            Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14 );
    } else {
        opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
        opus_int            smpl_buf_idx, decisionDelay;
        const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
        opus_int16          *pxq;
        VARDECL( opus_int32, sLTP_Q15 );
        VARDECL( opus_int16, sLTP );
        opus_int32          HarmShapeFIRPacked_Q14;
        opus_int            offset_Q10;
        opus_int32          RDmin_Q10, Gain_Q10;
        VARDECL( opus_int32, x_sc_Q10 );
        VARDECL( opus_int32, delayedGain_Q10 );
        VARDECL( NSQ_del_decs_struct, psDelDec );
        int32x4_t           t_s32x4;
        SAVE_STACK;

        /* Set unvoiced lag to the previous one, overwrite later for voiced */
        lag = NSQ->lagPrev;

        silk_assert( NSQ->prev_gain_Q16 != 0 );

        /* Initialize delayed decision states */
        ALLOC( psDelDec, 1, NSQ_del_decs_struct );
        /* Only RandState and RD_Q10 need to be initialized to 0. */
        silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) );
        vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) );

        /* Give each decision state its own random seed, derived from the frame seed. */
        for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
            psDelDec->SeedInit[ k ] = psDelDec->Seed[ k ] = ( k + psIndices->Seed ) & 3;
        }
        /* Replicate the common NSQ starting state into all 4 lanes. */
        vst1q_s32( psDelDec->LF_AR_Q14, vld1q_dup_s32( &NSQ->sLF_AR_shp_Q14 ) );
        vst1q_s32( psDelDec->Diff_Q14, vld1q_dup_s32( &NSQ->sDiff_shp_Q14 ) );
        vst1q_s32( psDelDec->Shape_Q14[ 0 ], vld1q_dup_s32( &NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ] ) );
        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
            vst1q_s32( psDelDec->sLPC_Q14[ i ], vld1q_dup_s32( &NSQ->sLPC_Q14[ i ] ) );
        }
        for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
            vst1q_s32( psDelDec->sAR2_Q14[ i ], vld1q_dup_s32( &NSQ->sAR2_Q14[ i ] ) );
        }

        offset_Q10   = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
        smpl_buf_idx = 0; /* index of oldest samples */

        decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );

        /* For voiced frames limit the decision delay to lower than the pitch lag */
        if( psIndices->signalType == TYPE_VOICED ) {
            opus_int pitch_min = pitchL[ 0 ];
            for( k = 1; k < psEncC->nb_subfr; k++ ) {
                pitch_min = silk_min_int( pitch_min, pitchL[ k ] );
            }
            decisionDelay = silk_min_int( decisionDelay, pitch_min - LTP_ORDER / 2 - 1 );
        } else {
            if( lag > 0 ) {
                decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
            }
        }

        if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
            LSF_interpolation_flag = 0;
        } else {
            LSF_interpolation_flag = 1;
        }

        ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
        ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
        ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
        ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
        /* Set up pointers to start of sub frame */
        pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
        NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
        NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
        subfr = 0;
        for( k = 0; k < psEncC->nb_subfr; k++ ) {
            /* With LSF interpolation, the first half of the frame uses the interpolated coefs. */
            A_Q12      = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
            B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER           ];
            AR_shp_Q13 = &AR_Q13[     k * MAX_SHAPE_LPC_ORDER ];

            /* Noise shape parameters */
            silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
            HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
            HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );

            NSQ->rewhite_flag = 0;
            if( psIndices->signalType == TYPE_VOICED ) {
                /* Voiced */
                lag = pitchL[ k ];

                /* Re-whitening */
                if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                    /* k == 2 only occurs here when LSF interpolation is used, i.e. when the */
                    /* prediction coefs change at the frame midpoint.                        */
                    if( k == 2 ) {
                        /* RESET DELAYED DECISIONS */
                        /* Find winner */
                        int32x4_t RD_Q10_s32x4;
                        RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
                        Winner_ind = 0;
                        for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
                            if( psDelDec->RD_Q10[ i ] < RDmin_Q10 ) {
                                RDmin_Q10 = psDelDec->RD_Q10[ i ];
                                Winner_ind = i;
                            }
                        }
                        /* Penalize all states except the winner so that subsequent decisions */
                        /* follow the winner's history (net effect: winner unchanged, others  */
                        /* get + silk_int32_MAX >> 4).                                        */
                        psDelDec->RD_Q10[ Winner_ind ] -= ( silk_int32_MAX >> 4 );
                        RD_Q10_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
                        RD_Q10_s32x4 = vaddq_s32( RD_Q10_s32x4, vdupq_n_s32( silk_int32_MAX >> 4 ) );
                        vst1q_s32( psDelDec->RD_Q10, RD_Q10_s32x4 );

                        /* Copy final part of signals from winner state to output and long-term filter states */
                        /* Gain in Q16 on Q14 samples via SMULWW, then >> 14 gives Q0 output. */
                        copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gains_Q16[ 1 ], 14, pulses, pxq, NSQ );

                        subfr = 0;
                    }

                    /* Rewhiten with new A coefs */
                    start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                    silk_assert( start_idx > 0 );

                    silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
                        A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );

                    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
                    NSQ->rewhite_flag = 1;
                }
            }

            silk_nsq_del_dec_scale_states_neon( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
                LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );

            silk_noise_shape_quantizer_del_dec_neon( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
                Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
                psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );

            x16    += psEncC->subfr_length;
            pulses += psEncC->subfr_length;
            pxq    += psEncC->subfr_length;
        }

        /* Find winner */
        RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
        Winner_ind = 0;
        for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
            if( psDelDec->RD_Q10[ k ] < RDmin_Q10 ) {
                RDmin_Q10 = psDelDec->RD_Q10[ k ];
                Winner_ind = k;
            }
        }

        /* Copy final part of signals from winner state to output and long-term filter states */
        psIndices->Seed = psDelDec->SeedInit[ Winner_ind ];
        /* Q16 gain >> 6 = Q10 gain; SMULWW with Q14 samples then >> 8 gives Q0 output. */
        Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
        copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gain_Q10, 8, pulses, pxq, NSQ );

        /* Copy the winner's short-term LPC state back, 4 entries at a time. */
        t_s32x4 = vdupq_n_s32( 0 ); /* initialization */
        for( i = 0; i < ( NSQ_LPC_BUF_LENGTH - 3 ); i += 4 ) {
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
            vst1q_s32( &NSQ->sLPC_Q14[ i ], t_s32x4 );
        }

        /* Scalar tail for the remaining (NSQ_LPC_BUF_LENGTH & 3) entries. */
        for( ; i < NSQ_LPC_BUF_LENGTH; i++ ) {
          NSQ->sLPC_Q14[ i ] = psDelDec->sLPC_Q14[ i ][ Winner_ind ];
        }

        /* Likewise for the winner's noise-shaping AR filter state. */
        for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) - 3 ); i += 4 ) {
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
            vst1q_s32( &NSQ->sAR2_Q14[ i ], t_s32x4 );
        }

        for( ; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
          NSQ->sAR2_Q14[ i ] = psDelDec->sAR2_Q14[ i ][ Winner_ind ];
        }

        /* Update states */
        NSQ->sLF_AR_shp_Q14 = psDelDec->LF_AR_Q14[ Winner_ind ];
        NSQ->sDiff_shp_Q14  = psDelDec->Diff_Q14[ Winner_ind ];
        NSQ->lagPrev        = pitchL[ psEncC->nb_subfr - 1 ];

        /* Save quantized speech signal */
        silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
        silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
        RESTORE_STACK;
    }

#ifdef OPUS_CHECK_ASM
    /* Verify the NEON path produced bit-exact results vs. the C reference. */
    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
    silk_assert( !memcmp( pulses_c, pulses_a, sizeof( pulses_c ) ) );
#endif
}
454 
455 /******************************************/
456 /* Noise shape quantizer for one subframe */
457 /******************************************/
/* Note: silk_short_prediction_create_arch_coef_neon(), defined in NSQ_neon.h, is actually a plain C helper function  */
/*       despite its name. Therefore "_local" is appended to the NEON implementation below to avoid confusion.        */
/* Expands the 16-bit Q12 prediction coefficients into 32-bit values shifted   */
/* left by 15, stored in reversed order so the prediction kernel can process   */
/* them with straight vector loads. For order 10, the top entries are zeroed.  */
/* (Comments list lanes from the highest coefficient index downwards.)         */
static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon_local(opus_int32 *out, const opus_int16 *in, opus_int order)
{
  int16x8_t coef_s16x8;
  int32x4_t out0_s32x4, out1_s32x4, out2_s32x4, out3_s32x4;
  silk_assert( order == 10 || order == 16 );

  coef_s16x8 = vrev64q_s16( vld1q_s16( in + 0 ) );                                     /* 4 5 6 7  0 1 2 3    */
  out3_s32x4 = vshll_n_s16( vget_low_s16(  coef_s16x8 ), 15 );                         /* 0 1 2 3             */
  out2_s32x4 = vshll_n_s16( vget_high_s16( coef_s16x8 ), 15 );                         /* 4 5 6 7             */

  if( order == 16 ) {
      coef_s16x8 = vrev64q_s16( vld1q_s16( in + 8 ) );                                 /* C D E F  8 9 A B    */
      out1_s32x4 = vshll_n_s16( vget_low_s16(  coef_s16x8 ), 15 );                     /* 8 9 A B             */
      out0_s32x4 = vshll_n_s16( vget_high_s16( coef_s16x8 ), 15 );                     /* C D E F             */
  } else {
      int16x4_t last_s16x4;

      out0_s32x4 = vdupq_n_s32( 0 );                                                   /* zero zero zero zero */
      last_s16x4 = vrev64_s16( vld1_s16( in + 6 ) );                                   /* 6    7    8    9    */
      out1_s32x4 = vshll_n_s16( last_s16x4, 15 );
      out1_s32x4 = vcombine_s32( vget_low_s32( out0_s32x4 ), vget_low_s32( out1_s32x4 ) ); /* 8    9    zero zero */
  }
  vst1q_s32( out +  0, out0_s32x4 );
  vst1q_s32( out +  4, out1_s32x4 );
  vst1q_s32( out +  8, out2_s32x4 );
  vst1q_s32( out + 12, out3_s32x4 );
}
490 
static OPUS_INLINE int32x4_t silk_SMLAWB_lane0_neon(
    const int32x4_t out_s32x4,
    const int32x4_t in_s32x4,
    const int32x2_t coef_s32x2
)
{
    /* 4-lane multiply-accumulate using lane 0 of coef_s32x2.  vqdmulh computes
     * ( 2 * in * coef ) >> 32; with coefficients pre-scaled by 1 << 15 (see
     * silk_short_prediction_create_arch_coef_neon_local()) this matches
     * silk_SMLAWB()'s ( in * c ) >> 16 accumulation.                          */
    const int32x4_t prod_s32x4 = vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 0 );
    return vaddq_s32( out_s32x4, prod_s32x4 );
}
499 
static OPUS_INLINE int32x4_t silk_SMLAWB_lane1_neon(
    const int32x4_t out_s32x4,
    const int32x4_t in_s32x4,
    const int32x2_t coef_s32x2
)
{
    /* Same as silk_SMLAWB_lane0_neon() but multiplies by lane 1 of coef_s32x2.
     * (The lane index must be a compile-time constant, hence two helpers.)    */
    const int32x4_t prod_s32x4 = vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 1 );
    return vaddq_s32( out_s32x4, prod_s32x4 );
}
508 
509 /* Note: This function has different return value than silk_noise_shape_quantizer_short_prediction_neon(). */
510 /*       Therefore here we append "_local" to the function name to avoid confusion.                        */
static OPUS_INLINE int32x4_t silk_noise_shape_quantizer_short_prediction_neon_local(const opus_int32 *buf32, const opus_int32 *a_Q12_arch, opus_int order)
{
    /* Short-term prediction for 4 delayed-decision states in parallel.
     * buf32 holds 16 taps interleaved by NEON_MAX_DEL_DEC_STATES; a_Q12_arch
     * holds 16 coefficients prepared (reversed, << 15, zero-padded for
     * order 10) by silk_short_prediction_create_arch_coef_neon_local(), so
     * all 16 taps can be accumulated unconditionally.                        */
    int32x4_t coef_s32x4[ 4 ];
    int32x4_t acc_s32x4;
    opus_int  n;

    silk_assert( order == 10 || order == 16 );

    coef_s32x4[ 0 ] = vld1q_s32( a_Q12_arch +  0 );
    coef_s32x4[ 1 ] = vld1q_s32( a_Q12_arch +  4 );
    coef_s32x4[ 2 ] = vld1q_s32( a_Q12_arch +  8 );
    coef_s32x4[ 3 ] = vld1q_s32( a_Q12_arch + 12 );

    /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    acc_s32x4 = vdupq_n_s32( silk_RSHIFT( order, 1 ) );
    for( n = 0; n < 4; n++ ) {
        const int32x2_t c_lo_s32x2 = vget_low_s32(  coef_s32x4[ n ] );
        const int32x2_t c_hi_s32x2 = vget_high_s32( coef_s32x4[ n ] );
        acc_s32x4 = silk_SMLAWB_lane0_neon( acc_s32x4, vld1q_s32( buf32 + ( 4 * n + 0 ) * NEON_MAX_DEL_DEC_STATES ), c_lo_s32x2 );
        acc_s32x4 = silk_SMLAWB_lane1_neon( acc_s32x4, vld1q_s32( buf32 + ( 4 * n + 1 ) * NEON_MAX_DEL_DEC_STATES ), c_lo_s32x2 );
        acc_s32x4 = silk_SMLAWB_lane0_neon( acc_s32x4, vld1q_s32( buf32 + ( 4 * n + 2 ) * NEON_MAX_DEL_DEC_STATES ), c_hi_s32x2 );
        acc_s32x4 = silk_SMLAWB_lane1_neon( acc_s32x4, vld1q_s32( buf32 + ( 4 * n + 3 ) * NEON_MAX_DEL_DEC_STATES ), c_hi_s32x2 );
    }

    return acc_s32x4;
}
541 
/* Delayed-decision noise shaping quantizer for one subframe.  Each int32x4_t
 * lane below carries one of the (up to 4, see note at the top of this file)
 * delayed-decision states in psDelDec; scalar work (LTP prediction, winner
 * search) is shared across states, all per-state filter updates run on 4
 * lanes at once.                                                             */
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
    NSQ_del_decs_struct psDelDec[],             /* I/O  Delayed decision states             */
    opus_int            signalType,             /* I    Signal type                         */
    const opus_int32    x_Q10[],                /* I                                        */
    opus_int8           pulses[],               /* O                                        */
    opus_int16          xq[],                   /* O                                        */
    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
    opus_int            lag,                    /* I    Pitch lag                           */
    opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
    opus_int32          LF_shp_Q14,             /* I                                        */
    opus_int32          Gain_Q16,               /* I                                        */
    opus_int            Lambda_Q10,             /* I                                        */
    opus_int            offset_Q10,             /* I                                        */
    opus_int            length,                 /* I    Input length                        */
    opus_int            subfr,                  /* I    Subframe number                     */
    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
    opus_int            warping_Q16,            /* I                                        */
    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
    opus_int            *smpl_buf_idx,          /* I/O  Index to newest samples in buffers  */
    opus_int            decisionDelay           /* I                                        */
)
{
    opus_int     i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
    opus_int32   Winner_rand_state;
    opus_int32   LTP_pred_Q14, n_LTP_Q14;
    opus_int32   RDmin_Q10, RDmax_Q10;
    opus_int32   Gain_Q10;
    opus_int32   *pred_lag_ptr, *shp_lag_ptr;
    opus_int32   a_Q12_arch[MAX_LPC_ORDER];
    /* warping/LF_shp pre-scaled by 1 << 15 so vqdmulhq, which computes
     * ( 2 * a * b ) >> 32, reproduces silk_SMLAWB()'s ( a * b16 ) >> 16 */
    const int32x2_t warping_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( warping_Q16, 16 ) >> 1 );
    const opus_int32 LF_shp_Q29 = silk_LSHIFT32( LF_shp_Q14, 16 ) >> 1;
    opus_int32 AR_shp_Q28[ MAX_SHAPE_LPC_ORDER ];
    const uint32x4_t rand_multiplier_u32x4 = vdupq_n_u32( RAND_MULTIPLIER );
    const uint32x4_t rand_increment_u32x4 = vdupq_n_u32( RAND_INCREMENT );

    VARDECL( NSQ_samples_struct, psSampleState );
    SAVE_STACK;

    silk_assert( nStatesDelayedDecision > 0 );
    silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
    ALLOC( psSampleState, 2, NSQ_samples_struct );

    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );

    /* Widen shaping coefficients from Q13 to Q28 ( << 15 ), 8 at a time, then
     * a scalar tail for any remainder. */
    for( i = 0; i < ( MAX_SHAPE_LPC_ORDER - 7 ); i += 8 ) {
        const int16x8_t t_s16x8 = vld1q_s16( AR_shp_Q13 +  i );
        vst1q_s32( AR_shp_Q28 + i + 0, vshll_n_s16( vget_low_s16(  t_s16x8 ), 15 ) );
        vst1q_s32( AR_shp_Q28 + i + 4, vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ) );
    }

    for( ; i < MAX_SHAPE_LPC_ORDER; i++ ) {
        AR_shp_Q28[i] = silk_LSHIFT32( AR_shp_Q13[i], 15 );
    }

    silk_short_prediction_create_arch_coef_neon_local( a_Q12_arch, a_Q12, predictLPCOrder );

    for( i = 0; i < length; i++ ) {
        int32x4_t Seed_s32x4, LPC_pred_Q14_s32x4;
        int32x4_t sign_s32x4, tmp1_s32x4, tmp2_s32x4;
        int32x4_t n_AR_Q14_s32x4, n_LF_Q14_s32x4;
        int32x2_t AR_shp_Q28_s32x2;
        int16x4_t r_Q10_s16x4, rr_Q10_s16x4;

        /* Perform common calculations used in all states */

        /* Long-term prediction */
        if( signalType == TYPE_VOICED ) {
            /* Unrolled loop */
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LTP_pred_Q14 = 2;
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[  0 ], b_Q14[ 0 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -1 ], b_Q14[ 1 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -2 ], b_Q14[ 2 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -3 ], b_Q14[ 3 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
            LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 );                          /* Q13 -> Q14 */
            pred_lag_ptr++;
        } else {
            LTP_pred_Q14 = 0;
        }

        /* Long-term shaping */
        if( lag > 0 ) {
            /* Symmetric, packed FIR coefficients */
            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
            n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */
            shp_lag_ptr++;
        } else {
            n_LTP_Q14 = 0;
        }

        /* Generate dither: Seed = Seed * RAND_MULTIPLIER + RAND_INCREMENT for
         * all 4 states, done in unsigned arithmetic so wraparound is well
         * defined. */
        Seed_s32x4 = vld1q_s32( psDelDec->Seed );
        Seed_s32x4 = vreinterpretq_s32_u32( vmlaq_u32( rand_increment_u32x4, vreinterpretq_u32_s32( Seed_s32x4 ), rand_multiplier_u32x4 ) );
        vst1q_s32( psDelDec->Seed, Seed_s32x4 );

        /* Short-term prediction */
        LPC_pred_Q14_s32x4 = silk_noise_shape_quantizer_short_prediction_neon_local(psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 16 + i ], a_Q12_arch, predictLPCOrder);
        LPC_pred_Q14_s32x4 = vshlq_n_s32( LPC_pred_Q14_s32x4, 4 ); /* Q10 -> Q14 */

        /* Noise shape feedback */
        /* Output of lowpass section */
        tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->Diff_Q14 ), vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), warping_Q16_s32x2 );
        /* Output of allpass section */
        tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ 1 ] ), tmp2_s32x4 );
        tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
        vst1q_s32( psDelDec->sAR2_Q14[ 0 ], tmp2_s32x4 );
        AR_shp_Q28_s32x2 = vld1_s32( AR_shp_Q28 );
        /* Start with order/2 to avoid a rounding bias, as in the scalar path above */
        n_AR_Q14_s32x4 = vaddq_s32( vdupq_n_s32( silk_RSHIFT( shapingLPCOrder, 1 ) ), vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );

        /* Loop over allpass sections */
        for( j = 2; j < shapingLPCOrder; j += 2 ) {
            /* Output of allpass section */
            tmp2_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4 );
            tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j - 1 ] ), tmp2_s32x4, warping_Q16_s32x2 );
            vst1q_s32( psDelDec->sAR2_Q14[ j - 1 ], tmp1_s32x4 );
            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
            /* Output of allpass section */
            tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 1 ] ), tmp2_s32x4 );
            tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
            vst1q_s32( psDelDec->sAR2_Q14[ j + 0 ], tmp2_s32x4 );
            AR_shp_Q28_s32x2 = vld1_s32( &AR_shp_Q28[ j ] );
            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );
        }
        vst1q_s32( psDelDec->sAR2_Q14[ shapingLPCOrder - 1 ], tmp1_s32x4 );
        n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
        n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 1 );                                                                                        /* Q11 -> Q12 */
        n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( Tilt_Q14, 16 ) >> 1 ) );     /* Q12 */
        n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 2 );                                                                                        /* Q12 -> Q14 */
        n_LF_Q14_s32x4 = vqdmulhq_n_s32( vld1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ] ), LF_shp_Q29 );                                         /* Q12 */
        n_LF_Q14_s32x4 = vaddq_s32( n_LF_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( LF_shp_Q14 >> 16 , 15 ) ) ); /* Q12 */
        n_LF_Q14_s32x4 = vshlq_n_s32( n_LF_Q14_s32x4, 2 );                                                                                        /* Q12 -> Q14 */

        /* Input minus prediction plus noise feedback                       */
        /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
        tmp1_s32x4 = vaddq_s32( n_AR_Q14_s32x4, n_LF_Q14_s32x4 );               /* Q14 */
        tmp2_s32x4 = vaddq_s32( vdupq_n_s32( n_LTP_Q14 ), LPC_pred_Q14_s32x4 ); /* Q13 */
        tmp1_s32x4 = vsubq_s32( tmp2_s32x4, tmp1_s32x4 );                       /* Q13 */
        tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 4 );                             /* Q10 */
        tmp1_s32x4 = vsubq_s32( vdupq_n_s32( x_Q10[ i ] ), tmp1_s32x4 );        /* residual error Q10 */

        /* Flip sign depending on dither: sign_s32x4 is all-ones where Seed < 0,
         * and eor + sub together negate those lanes. */
        sign_s32x4 = vreinterpretq_s32_u32( vcltq_s32( Seed_s32x4, vdupq_n_s32( 0 ) ) );
        tmp1_s32x4 = veorq_s32( tmp1_s32x4, sign_s32x4 );
        tmp1_s32x4 = vsubq_s32( tmp1_s32x4, sign_s32x4 );
        /* Clamp the residual to [ -31, 30 ] in Q10 before narrowing to 16 bits */
        tmp1_s32x4 = vmaxq_s32( tmp1_s32x4, vdupq_n_s32( -( 31 << 10 ) ) );
        tmp1_s32x4 = vminq_s32( tmp1_s32x4, vdupq_n_s32( 30 << 10 ) );
        r_Q10_s16x4 = vmovn_s32( tmp1_s32x4 );

        /* Find two quantization level candidates and measure their rate-distortion */
        {
            int16x4_t q1_Q10_s16x4 = vsub_s16( r_Q10_s16x4, vdup_n_s16( offset_Q10 ) );
            int16x4_t q1_Q0_s16x4 = vshr_n_s16( q1_Q10_s16x4, 10 );
            int16x4_t q2_Q10_s16x4;
            int32x4_t rd1_Q10_s32x4, rd2_Q10_s32x4;
            uint32x4_t t_u32x4;

            if( Lambda_Q10 > 2048 ) {
                /* For aggressive RDO, the bias becomes more than one pulse. */
                const int rdo_offset = Lambda_Q10/2 - 512;
                const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) );
                const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) );
                /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */
                silk_assert( Lambda_Q10 <= 32767 );

                /* Lanes inside ( -rdo_offset, rdo_offset ) quantize towards 0
                 * (-1 for negative lanes, via the comparison mask); the others
                 * have the bias removed before the >> 10. */
                q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) );
                q1_Q0_s16x4 = vbsl_s16( greaterThanRdo, vsub_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
                q1_Q0_s16x4 = vbsl_s16( lessThanMinusRdo, vadd_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
                q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 );
            }
            {
                const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) );
                const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                int16x4_t tmp1_s16x4, tmp2_s16x4;

                /* First candidate q1: q1_Q0 scaled back to Q10, offset and
                 * QUANT_LEVEL_ADJUST applied per-lane by sign/zero cases. */
                q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 );
                tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
                q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ) );
                q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 );
                q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 );
                q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 );
                /* Second candidate q2: one pulse ( 1024 in Q10 ) above q1 */
                q2_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( 1024 ) );
                q2_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 + 1024 - QUANT_LEVEL_ADJUST_Q10 ), q2_Q10_s16x4 );
                q2_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 ), q2_Q10_s16x4 );
                /* Rate term: |q| * Lambda (negate the lanes that are negative) */
                tmp1_s16x4 = q1_Q10_s16x4;
                tmp2_s16x4 = q2_Q10_s16x4;
                tmp1_s16x4 = vbsl_s16( vorr_u16( equalMinus1_u16x4, lessThanMinus1_u16x4 ), vneg_s16( tmp1_s16x4 ), tmp1_s16x4 );
                tmp2_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vneg_s16( tmp2_s16x4 ), tmp2_s16x4 );
                rd1_Q10_s32x4 = vmull_s16( tmp1_s16x4, vdup_n_s16( Lambda_Q10 ) );
                rd2_Q10_s32x4 = vmull_s16( tmp2_s16x4, vdup_n_s16( Lambda_Q10 ) );
            }

            /* Distortion term: add squared quantization error, then Q20 -> Q10 */
            rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q1_Q10_s16x4 );
            rd1_Q10_s32x4 = vmlal_s16( rd1_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
            rd1_Q10_s32x4 = vshrq_n_s32( rd1_Q10_s32x4, 10 );

            rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q2_Q10_s16x4 );
            rd2_Q10_s32x4 = vmlal_s16( rd2_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
            rd2_Q10_s32x4 = vshrq_n_s32( rd2_Q10_s32x4, 10 );

            /* psSampleState[ 0 ] gets the better candidate per lane,
             * psSampleState[ 1 ] the worse one. */
            tmp2_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
            tmp1_s32x4 = vaddq_s32( tmp2_s32x4, vminq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
            tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vmaxq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
            vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );
            t_u32x4 = vcltq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 );
            tmp1_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q1_Q10_s16x4 ), vmovl_s16( q2_Q10_s16x4 ) );
            tmp2_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q2_Q10_s16x4 ), vmovl_s16( q1_Q10_s16x4 ) );
            vst1q_s32( psSampleState[ 0 ].Q_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].Q_Q10, tmp2_s32x4 );
        }

        {
            /* Update states for best quantization */
            int32x4_t exc_Q14_s32x4, LPC_exc_Q14_s32x4, xq_Q14_s32x4, sLF_AR_shp_Q14_s32x4;

            /* Quantized excitation */
            exc_Q14_s32x4 = vshlq_n_s32( tmp1_s32x4, 4 );
            exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
            exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );

            /* Add predictions */
            LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
            xq_Q14_s32x4      = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );

            /* Update states */
            tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
            vst1q_s32( psSampleState[ 0 ].Diff_Q14, tmp1_s32x4 );
            sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
            vst1q_s32( psSampleState[ 0 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].xq_Q14, xq_Q14_s32x4 );

            /* Same updates for the second-best quantization (psSampleState[ 1 ]) */
            /* Quantized excitation */
            exc_Q14_s32x4 = vshlq_n_s32( tmp2_s32x4, 4 );
            exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
            exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );

            /* Add predictions */
            LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
            xq_Q14_s32x4      = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );

            /* Update states */
            tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
            vst1q_s32( psSampleState[ 1 ].Diff_Q14, tmp1_s32x4 );
            sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
            vst1q_s32( psSampleState[ 1 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].xq_Q14, xq_Q14_s32x4 );
        }

        /* Decrement circular buffer index with wraparound, then wrap
         * last_smple_idx into [ 0, DECISION_DELAY ) with two conditional
         * subtractions (the sum can exceed 2 * DECISION_DELAY - 1). */
        *smpl_buf_idx = *smpl_buf_idx ? ( *smpl_buf_idx - 1 ) : ( DECISION_DELAY - 1);
        last_smple_idx = *smpl_buf_idx + decisionDelay + DECISION_DELAY;
        if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
        if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;

        /* Find winner */
        RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
        Winner_ind = 0;
        for( k = 1; k < nStatesDelayedDecision; k++ ) {
            if( psSampleState[ 0 ].RD_Q10[ k ] < RDmin_Q10 ) {
                RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
                Winner_ind = k;
            }
        }

        /* Increase RD values of expired states */
        {
            uint32x4_t t_u32x4;
            Winner_rand_state = psDelDec->RandState[ last_smple_idx ][ Winner_ind ];
            /* Lanes whose RandState differs from the winner's get a large RD
             * penalty ( ~0u >> 5 added as a signed value ). */
            t_u32x4 = vceqq_s32( vld1q_s32( psDelDec->RandState[ last_smple_idx ] ), vdupq_n_s32( Winner_rand_state ) );
            t_u32x4 = vmvnq_u32( t_u32x4 );
            t_u32x4 = vshrq_n_u32( t_u32x4, 5 );
            tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].RD_Q10 );
            tmp2_s32x4 = vld1q_s32( psSampleState[ 1 ].RD_Q10 );
            tmp1_s32x4 = vaddq_s32( tmp1_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
            tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
            vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );

            /* Find worst in first set and best in second set */
            RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
            RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ 0 ];
            RDmax_ind = 0;
            RDmin_ind = 0;
            for( k = 1; k < nStatesDelayedDecision; k++ ) {
                /* find worst in first set */
                if( psSampleState[ 0 ].RD_Q10[ k ] > RDmax_Q10 ) {
                    RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
                    RDmax_ind = k;
                }
                /* find best in second set */
                if( psSampleState[ 1 ].RD_Q10[ k ] < RDmin_Q10 ) {
                    RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ k ];
                    RDmin_ind = k;
                }
            }
        }

        /* Replace a state if best from second set outperforms worst in first set */
        if( RDmin_Q10 < RDmax_Q10 ) {
            /* Copy lane RDmin_ind onto lane RDmax_ind across all per-state
             * arrays in NSQ_del_decs_struct; numOthers counts the 4-lane rows
             * that follow sLPC_Q14 in the struct layout. */
            opus_int32 (*ptr)[NEON_MAX_DEL_DEC_STATES] = psDelDec->RandState;
            const int numOthers = (int)( ( sizeof( NSQ_del_decs_struct ) - sizeof( ( (NSQ_del_decs_struct *)0 )->sLPC_Q14 ) )
                / ( NEON_MAX_DEL_DEC_STATES * sizeof( opus_int32 ) ) );
            /* Only ( predictLPCOrder - 1 ) of sLPC_Q14 buffer need to be updated, though the first several     */
            /* useless sLPC_Q14[] will be different comparing with C when predictLPCOrder < NSQ_LPC_BUF_LENGTH. */
            /* Here just update constant ( NSQ_LPC_BUF_LENGTH - 1 ) for simplicity.                             */
            for( j = i + 1; j < i + NSQ_LPC_BUF_LENGTH; j++ ) {
                psDelDec->sLPC_Q14[ j ][ RDmax_ind ] = psDelDec->sLPC_Q14[ j ][ RDmin_ind ];
            }
            for( j = 0; j < numOthers; j++ ) {
                ptr[ j ][ RDmax_ind ] = ptr[ j ][ RDmin_ind ];
            }

            psSampleState[ 0 ].Q_Q10[ RDmax_ind ] = psSampleState[ 1 ].Q_Q10[ RDmin_ind ];
            psSampleState[ 0 ].RD_Q10[ RDmax_ind ] = psSampleState[ 1 ].RD_Q10[ RDmin_ind ];
            psSampleState[ 0 ].xq_Q14[ RDmax_ind ] = psSampleState[ 1 ].xq_Q14[ RDmin_ind ];
            psSampleState[ 0 ].LF_AR_Q14[ RDmax_ind ] = psSampleState[ 1 ].LF_AR_Q14[ RDmin_ind ];
            psSampleState[ 0 ].Diff_Q14[ RDmax_ind ] = psSampleState[ 1 ].Diff_Q14[ RDmin_ind ];
            psSampleState[ 0 ].sLTP_shp_Q14[ RDmax_ind ] = psSampleState[ 1 ].sLTP_shp_Q14[ RDmin_ind ];
            psSampleState[ 0 ].LPC_exc_Q14[ RDmax_ind ] = psSampleState[ 1 ].LPC_exc_Q14[ RDmin_ind ];
        }

        /* Write samples from winner to output and long-term filter states */
        if( subfr > 0 || i >= decisionDelay ) {
            pulses[  i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
            xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
                silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
            NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
            sLTP_Q15[          NSQ->sLTP_buf_idx     - decisionDelay ] = psDelDec->Pred_Q15[  last_smple_idx ][ Winner_ind ];
        }
        NSQ->sLTP_shp_buf_idx++;
        NSQ->sLTP_buf_idx++;

        /* Update states */
        vst1q_s32( psDelDec->LF_AR_Q14, vld1q_s32( psSampleState[ 0 ].LF_AR_Q14 ) );
        vst1q_s32( psDelDec->Diff_Q14, vld1q_s32( psSampleState[ 0 ].Diff_Q14 ) );
        vst1q_s32( psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
        vst1q_s32( psDelDec->Xq_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
        tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].Q_Q10 );
        vst1q_s32( psDelDec->Q_Q10[ *smpl_buf_idx ], tmp1_s32x4 );
        vst1q_s32( psDelDec->Pred_Q15[ *smpl_buf_idx ], vshlq_n_s32( vld1q_s32( psSampleState[ 0 ].LPC_exc_Q14 ), 1 ) );
        vst1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].sLTP_shp_Q14 ) );
        /* RandState gets Seed + round( Q_Q10 ) so the decoder's dither can be reproduced */
        tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 10 );
        tmp1_s32x4 = vaddq_s32( vld1q_s32( psDelDec->Seed ), tmp1_s32x4 );
        vst1q_s32( psDelDec->Seed, tmp1_s32x4 );
        vst1q_s32( psDelDec->RandState[ *smpl_buf_idx ], tmp1_s32x4 );
        vst1q_s32( psDelDec->RD_Q10, vld1q_s32( psSampleState[ 0 ].RD_Q10 ) );
        delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10;
    }
    /* Update LPC states */
    silk_memcpy( psDelDec->sLPC_Q14[ 0 ], psDelDec->sLPC_Q14[ length ], NEON_MAX_DEL_DEC_STATES * NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );

    RESTORE_STACK;
}
910 
static OPUS_INLINE void silk_SMULWB_8_neon(
    const opus_int16 *a,
    const int32x2_t  b,
    opus_int32       *o
)
{
    /* o[k] = ( a[k] * b[lane 0] ) >> 16 for k = 0..7.  The 16-bit inputs are
     * widened with << 15 so the doubling high multiply, ( 2 * x * y ) >> 32,
     * yields the product shifted right by 16 — matching silk_SMULWW() for
     * 16-bit a (see the tail loop in silk_SMULWW_loop_neon()). */
    const int16x8_t in_s16x8 = vld1q_s16( a );
    int32x4_t lo_s32x4 = vshll_n_s16( vget_low_s16(  in_s16x8 ), 15 );
    int32x4_t hi_s32x4 = vshll_n_s16( vget_high_s16( in_s16x8 ), 15 );

    lo_s32x4 = vqdmulhq_lane_s32( lo_s32x4, b, 0 );
    hi_s32x4 = vqdmulhq_lane_s32( hi_s32x4, b, 0 );
    vst1q_s32( o,     lo_s32x4 );
    vst1q_s32( o + 4, hi_s32x4 );
}
927 
928 /* Only works when ( b >= -65536 ) && ( b < 65536 ). */
static OPUS_INLINE void silk_SMULWW_small_b_4_neon(
    opus_int32       *a,
    const int32x2_t  b_s32x2)
{
    /* In-place scale of 4 values: a[k] = ( 2 * a[k] * b_s32x2[0] ) >> 32.
     * NOTE(review): presumably lane 0 carries the multiplier pre-scaled so
     * this matches silk_SMULWW() for the small-b range in the comment above —
     * confirm at the call sites (not visible in this chunk). */
    int32x4_t v_s32x4 = vld1q_s32( a );

    v_s32x4 = vqdmulhq_lane_s32( v_s32x4, b_s32x2, 0 );
    vst1q_s32( a, v_s32x4 );
}
939 
940 /* Only works when ( b >= -65536 ) && ( b < 65536 ). */
static OPUS_INLINE void silk_SMULWW_small_b_8_neon(
    opus_int32       *a,
    const int32x2_t  b_s32x2
)
{
    /* 8-wide variant of silk_SMULWW_small_b_4_neon():
     * a[k] = ( 2 * a[k] * b_s32x2[0] ) >> 32 for k = 0..7, in place. */
    int32x4_t lo_s32x4 = vld1q_s32( a );
    int32x4_t hi_s32x4 = vld1q_s32( a + 4 );

    lo_s32x4 = vqdmulhq_lane_s32( lo_s32x4, b_s32x2, 0 );
    hi_s32x4 = vqdmulhq_lane_s32( hi_s32x4, b_s32x2, 0 );
    vst1q_s32( a,     lo_s32x4 );
    vst1q_s32( a + 4, hi_s32x4 );
}
955 
static OPUS_INLINE void silk_SMULWW_4_neon(
    opus_int32       *a,
    const int32x2_t  b_s32x2)
{
    /* In-place a[k] = ( ( 2 * a[k] * b_s32x2[0] ) >> 32 ) + a[k] * b_s32x2[1]
     * for k = 0..3.  The two lanes carry a split representation of the 32-bit
     * multiplier; see the call sites for how b_s32x2 is constructed. */
    const int32x4_t in_s32x4 = vld1q_s32( a );
    int32x4_t res_s32x4;

    res_s32x4 = vqdmulhq_lane_s32( in_s32x4, b_s32x2, 0 );
    res_s32x4 = vmlaq_lane_s32( res_s32x4, in_s32x4, b_s32x2, 1 );
    vst1q_s32( a, res_s32x4 );
}
967 
static OPUS_INLINE void silk_SMULWW_8_neon(
    opus_int32       *a,
    const int32x2_t  b_s32x2
)
{
    /* 8-wide variant of silk_SMULWW_4_neon(): for k = 0..7, in place,
     * a[k] = ( ( 2 * a[k] * b_s32x2[0] ) >> 32 ) + a[k] * b_s32x2[1]. */
    const int32x4_t in0_s32x4 = vld1q_s32( a );
    const int32x4_t in1_s32x4 = vld1q_s32( a + 4 );
    int32x4_t res0_s32x4, res1_s32x4;

    res0_s32x4 = vqdmulhq_lane_s32( in0_s32x4, b_s32x2, 0 );
    res1_s32x4 = vqdmulhq_lane_s32( in1_s32x4, b_s32x2, 0 );
    res0_s32x4 = vmlaq_lane_s32( res0_s32x4, in0_s32x4, b_s32x2, 1 );
    res1_s32x4 = vmlaq_lane_s32( res1_s32x4, in1_s32x4, b_s32x2, 1 );
    vst1q_s32( a,     res0_s32x4 );
    vst1q_s32( a + 4, res1_s32x4 );
}
984 
/* o[ i ] = silk_SMULWW( a[ i ], b ) for i in [ 0, loop_num ).
   The main part is vectorized 8 samples at a time; a scalar loop
   handles the remaining ( loop_num & 7 ) samples. */
static OPUS_INLINE void silk_SMULWW_loop_neon(
    const opus_int16 *a,
    const opus_int32 b,
    opus_int32       *o,
    const opus_int   loop_num
)
{
    opus_int  k;
    int32x2_t b_dup_s32x2;

    b_dup_s32x2 = vdup_n_s32( b );
    k = 0;
    /* Vector main loop: since a[] is 16-bit, SMULWW reduces to SMULWB here. */
    while( k + 8 <= loop_num ) {
        silk_SMULWB_8_neon( a + k, b_dup_s32x2, o + k );
        k += 8;
    }
    /* Scalar tail. */
    for( ; k < loop_num; k++ ) {
        o[ k ] = silk_SMULWW( a[ k ], b );
    }
}
1003 
/* Scales the NSQ input and all delayed-decision states by 1/gain for the
   current subframe; NEON counterpart of the C silk_nsq_del_dec_scale_states().
   Per the note at the top of this file, the psDelDec state arrays hold up to
   4 parallel decision states, so each 4-wide NEON helper call below updates
   one state variable across all decision states at once. */
static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
    NSQ_del_decs_struct psDelDec[],                 /* I/O  Delayed decision states             */
    const opus_int16    x16[],                      /* I    Input                               */
    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
    opus_int            subfr,                      /* I    Subframe number                     */
    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
    const opus_int      signal_type,                /* I    Signal type                         */
    const opus_int      decisionDelay               /* I    Decision delay                      */
)
{
    opus_int            i, lag;
    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;

    lag          = pitchL[ subfr ];
    /* Inverse gain; the gain is clamped to >= 1 so the inverse is well defined. */
    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    silk_assert( inv_gain_Q31 != 0 );

    /* Scale input */
    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
    silk_SMULWW_loop_neon( x16, inv_gain_Q26, x_sc_Q10, psEncC->subfr_length );

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
        }
        /* Scale the lag + LTP_ORDER/2 rewhitened samples ending at sLTP_buf_idx. */
        silk_SMULWW_loop_neon( sLTP + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, inv_gain_Q31, sLTP_Q15 + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, lag + LTP_ORDER / 2 );
    }

    /* Adjust for changing gain */
    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
        int32x2_t gain_adj_Q16_s32x2;
        gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );

        /* Scale long-term shaping state */
        if( ( gain_adj_Q16 >= -65536 ) && ( gain_adj_Q16 < 65536 ) ) {
            /* gain_adj_Q16 fits in 17 bits: pre-shift it left by 15 so the
               small_b helpers can compute ( x * gain_adj_Q16 ) >> 16 with a
               single saturating doubling multiply-high per vector. */
            gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16, 15 ) );
            /* 8-wide vector main loop over the shaping buffer... */
            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
                silk_SMULWW_small_b_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
            }
            /* ...and scalar tail. */
            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
            }

            /* Scale long-term prediction state */
            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
                    silk_SMULWW_small_b_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
                }
                for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
                }
            }

            /* Scale scalar states */
            /* Each 4-wide call updates one variable across all decision states. */
            silk_SMULWW_small_b_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
            silk_SMULWW_small_b_4_neon( psDelDec->Diff_Q14,  gain_adj_Q16_s32x2 );

            /* Scale short-term prediction and shaping states */
            for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < DECISION_DELAY; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->Pred_Q15[  i ], gain_adj_Q16_s32x2 );
                silk_SMULWW_small_b_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
            }
        } else {
            /* Full-range gain_adj_Q16: split it into a low fractional part
               (lane 0, shifted left 15) and a high integer part (lane 1) so
               the full-precision SMULWW helpers can combine vqdmulh + vmla. */
            gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16 & 0x0000FFFF, 15 ) );
            gain_adj_Q16_s32x2 = vset_lane_s32( gain_adj_Q16 >> 16, gain_adj_Q16_s32x2, 1 );
            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
                silk_SMULWW_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
            }
            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
            }

            /* Scale long-term prediction state */
            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
                    silk_SMULWW_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
                }
                for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
                }
            }

            /* Scale scalar states */
            silk_SMULWW_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
            silk_SMULWW_4_neon( psDelDec->Diff_Q14,  gain_adj_Q16_s32x2 );

            /* Scale short-term prediction and shaping states */
            for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
                silk_SMULWW_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
                silk_SMULWW_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < DECISION_DELAY; i++ ) {
                silk_SMULWW_4_neon( psDelDec->Pred_Q15[  i ], gain_adj_Q16_s32x2 );
                silk_SMULWW_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
            }
        }

        /* Save inverse gain */
        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
    }
}
1125