1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date:        19. March 2015
5 * $Revision: 	V.1.4.5
6 *
7 * Project: 	    CMSIS DSP Library
8 * Title:	    arm_dct4_q15.c
9 *
10 * Description:	Processing function of DCT4 & IDCT4 Q15.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *   - Redistributions of source code must retain the above copyright
18 *     notice, this list of conditions and the following disclaimer.
19 *   - Redistributions in binary form must reproduce the above copyright
20 *     notice, this list of conditions and the following disclaimer in
21 *     the documentation and/or other materials provided with the
22 *     distribution.
23 *   - Neither the name of ARM LIMITED nor the names of its contributors
24 *     may be used to endorse or promote products derived from this
25 *     software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
43 /**
44  * @addtogroup DCT4_IDCT4
45  * @{
46  */
47 
48 /**
49  * @brief Processing function for the Q15 DCT4/IDCT4.
50  * @param[in]       *S             points to an instance of the Q15 DCT4 structure.
51  * @param[in]       *pState        points to state buffer.
52  * @param[in,out]   *pInlineBuffer points to the in-place input and output buffer.
53  * @return none.
54  *
55  * \par Input an output formats:
56  * Internally inputs are downscaled in the RFFT process function to avoid overflows.
57  * Number of bits downscaled, depends on the size of the transform.
58  * The input and output formats for different DCT sizes and number of bits to upscale are mentioned in the table below:
59  *
60  * \image html dct4FormatsQ15Table.gif
61  */
62 
arm_dct4_q15(const arm_dct4_instance_q15 * S,q15_t * pState,q15_t * pInlineBuffer)63 void arm_dct4_q15(
64   const arm_dct4_instance_q15 * S,
65   q15_t * pState,
66   q15_t * pInlineBuffer)
67 {
68   uint32_t i;                                    /* Loop counter */
69   q15_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
70   q15_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
71   q15_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
72   q15_t in;                                      /* Temporary variable */
73 
74 
75   /* DCT4 computation involves DCT2 (which is calculated using RFFT)
76    * along with some pre-processing and post-processing.
77    * Computational procedure is explained as follows:
78    * (a) Pre-processing involves multiplying input with cos factor,
79    *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
80    *              where,
81    *                 r(n) -- output of preprocessing
82    *                 u(n) -- input to preprocessing(actual Source buffer)
83    * (b) Calculation of DCT2 using FFT is divided into three steps:
84    *                  Step1: Re-ordering of even and odd elements of input.
85    *                  Step2: Calculating FFT of the re-ordered input.
86    *                  Step3: Taking the real part of the product of FFT output and weights.
87    * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
88    *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
89    *                        where,
90    *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
91    * (d) Multiplying the output with the normalizing factor sqrt(2/N).
92    */
93 
94         /*-------- Pre-processing ------------*/
95   /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
96   arm_mult_q15(pInlineBuffer, cosFact, pInlineBuffer, S->N);
97   arm_shift_q15(pInlineBuffer, 1, pInlineBuffer, S->N);
98 
99   /* ----------------------------------------------------------------
100    * Step1: Re-ordering of even and odd elements as
101    *             pState[i] =  pInlineBuffer[2*i] and
102    *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
103    ---------------------------------------------------------------------*/
104 
105   /* pS1 initialized to pState */
106   pS1 = pState;
107 
108   /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
109   pS2 = pState + (S->N - 1u);
110 
111   /* pbuff initialized to input buffer */
112   pbuff = pInlineBuffer;
113 
114 
115 #ifndef ARM_MATH_CM0_FAMILY
116 
117   /* Run the below code for Cortex-M4 and Cortex-M3 */
118 
119   /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
120   i = (uint32_t) S->Nby2 >> 2u;
121 
122   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
123    ** a second loop below computes the remaining 1 to 3 samples. */
124   do
125   {
126     /* Re-ordering of even and odd elements */
127     /* pState[i] =  pInlineBuffer[2*i] */
128     *pS1++ = *pbuff++;
129     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
130     *pS2-- = *pbuff++;
131 
132     *pS1++ = *pbuff++;
133     *pS2-- = *pbuff++;
134 
135     *pS1++ = *pbuff++;
136     *pS2-- = *pbuff++;
137 
138     *pS1++ = *pbuff++;
139     *pS2-- = *pbuff++;
140 
141     /* Decrement the loop counter */
142     i--;
143   } while(i > 0u);
144 
145   /* pbuff initialized to input buffer */
146   pbuff = pInlineBuffer;
147 
148   /* pS1 initialized to pState */
149   pS1 = pState;
150 
151   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
152   i = (uint32_t) S->N >> 2u;
153 
154   /* Processing with loop unrolling 4 times as N is always multiple of 4.
155    * Compute 4 outputs at a time */
156   do
157   {
158     /* Writing the re-ordered output back to inplace input buffer */
159     *pbuff++ = *pS1++;
160     *pbuff++ = *pS1++;
161     *pbuff++ = *pS1++;
162     *pbuff++ = *pS1++;
163 
164     /* Decrement the loop counter */
165     i--;
166   } while(i > 0u);
167 
168 
169   /* ---------------------------------------------------------
170    *     Step2: Calculate RFFT for N-point input
171    * ---------------------------------------------------------- */
172   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
173   arm_rfft_q15(S->pRfft, pInlineBuffer, pState);
174 
175  /*----------------------------------------------------------------------
176   *  Step3: Multiply the FFT output with the weights.
177   *----------------------------------------------------------------------*/
178   arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N);
179 
180   /* The output of complex multiplication is in 3.13 format.
181    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
182   arm_shift_q15(pState, 2, pState, S->N * 2);
183 
184   /* ----------- Post-processing ---------- */
185   /* DCT-IV can be obtained from DCT-II by the equation,
186    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
187    *       Hence, Y4(0) = Y2(0)/2  */
188   /* Getting only real part from the output and Converting to DCT-IV */
189 
190   /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
191   i = ((uint32_t) S->N - 1u) >> 2u;
192 
193   /* pbuff initialized to input buffer. */
194   pbuff = pInlineBuffer;
195 
196   /* pS1 initialized to pState */
197   pS1 = pState;
198 
199   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
200   in = *pS1++ >> 1u;
201   /* input buffer acts as inplace, so output values are stored in the input itself. */
202   *pbuff++ = in;
203 
204   /* pState pointer is incremented twice as the real values are located alternatively in the array */
205   pS1++;
206 
207   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
208    ** a second loop below computes the remaining 1 to 3 samples. */
209   do
210   {
211     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
212     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
213     in = *pS1++ - in;
214     *pbuff++ = in;
215     /* points to the next real value */
216     pS1++;
217 
218     in = *pS1++ - in;
219     *pbuff++ = in;
220     pS1++;
221 
222     in = *pS1++ - in;
223     *pbuff++ = in;
224     pS1++;
225 
226     in = *pS1++ - in;
227     *pbuff++ = in;
228     pS1++;
229 
230     /* Decrement the loop counter */
231     i--;
232   } while(i > 0u);
233 
234   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
235    ** No loop unrolling is used. */
236   i = ((uint32_t) S->N - 1u) % 0x4u;
237 
238   while(i > 0u)
239   {
240     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
241     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
242     in = *pS1++ - in;
243     *pbuff++ = in;
244     /* points to the next real value */
245     pS1++;
246 
247     /* Decrement the loop counter */
248     i--;
249   }
250 
251 
252    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
253 
254   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
255   i = (uint32_t) S->N >> 2u;
256 
257   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
258   pbuff = pInlineBuffer;
259 
260   /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
261   do
262   {
263     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
264     in = *pbuff;
265     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
266 
267     in = *pbuff;
268     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
269 
270     in = *pbuff;
271     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
272 
273     in = *pbuff;
274     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
275 
276     /* Decrement the loop counter */
277     i--;
278   } while(i > 0u);
279 
280 
281 #else
282 
283   /* Run the below code for Cortex-M0 */
284 
285   /* Initializing the loop counter to N/2 */
286   i = (uint32_t) S->Nby2;
287 
288   do
289   {
290     /* Re-ordering of even and odd elements */
291     /* pState[i] =  pInlineBuffer[2*i] */
292     *pS1++ = *pbuff++;
293     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
294     *pS2-- = *pbuff++;
295 
296     /* Decrement the loop counter */
297     i--;
298   } while(i > 0u);
299 
300   /* pbuff initialized to input buffer */
301   pbuff = pInlineBuffer;
302 
303   /* pS1 initialized to pState */
304   pS1 = pState;
305 
306   /* Initializing the loop counter */
307   i = (uint32_t) S->N;
308 
309   do
310   {
311     /* Writing the re-ordered output back to inplace input buffer */
312     *pbuff++ = *pS1++;
313 
314     /* Decrement the loop counter */
315     i--;
316   } while(i > 0u);
317 
318 
319   /* ---------------------------------------------------------
320    *     Step2: Calculate RFFT for N-point input
321    * ---------------------------------------------------------- */
322   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
323   arm_rfft_q15(S->pRfft, pInlineBuffer, pState);
324 
325  /*----------------------------------------------------------------------
326   *  Step3: Multiply the FFT output with the weights.
327   *----------------------------------------------------------------------*/
328   arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N);
329 
330   /* The output of complex multiplication is in 3.13 format.
331    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
332   arm_shift_q15(pState, 2, pState, S->N * 2);
333 
334   /* ----------- Post-processing ---------- */
335   /* DCT-IV can be obtained from DCT-II by the equation,
336    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
337    *       Hence, Y4(0) = Y2(0)/2  */
338   /* Getting only real part from the output and Converting to DCT-IV */
339 
340   /* Initializing the loop counter */
341   i = ((uint32_t) S->N - 1u);
342 
343   /* pbuff initialized to input buffer. */
344   pbuff = pInlineBuffer;
345 
346   /* pS1 initialized to pState */
347   pS1 = pState;
348 
349   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
350   in = *pS1++ >> 1u;
351   /* input buffer acts as inplace, so output values are stored in the input itself. */
352   *pbuff++ = in;
353 
354   /* pState pointer is incremented twice as the real values are located alternatively in the array */
355   pS1++;
356 
357   do
358   {
359     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
360     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
361     in = *pS1++ - in;
362     *pbuff++ = in;
363     /* points to the next real value */
364     pS1++;
365 
366     /* Decrement the loop counter */
367     i--;
368   } while(i > 0u);
369 
370    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
371 
372   /* Initializing the loop counter */
373   i = (uint32_t) S->N;
374 
375   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
376   pbuff = pInlineBuffer;
377 
378   do
379   {
380     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
381     in = *pbuff;
382     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
383 
384     /* Decrement the loop counter */
385     i--;
386   } while(i > 0u);
387 
388 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
389 
390 }
391 
392 /**
393    * @} end of DCT4_IDCT4 group
394    */
395