/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggVorbis 'TREMOR' CODEC SOURCE CODE.   *
 *                                                                  *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE OggVorbis 'TREMOR' SOURCE CODE IS (C) COPYRIGHT 1994-2002    *
 * BY THE Xiph.Org FOUNDATION http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

 function: arm7 and later wide math functions

 ********************************************************************/
#ifdef CPU_ARM

#define INCL_OPTIMIZED_MULT32
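/* MULT32(x, y): high 32 bits of the signed 64-bit product x*y; roughly the
   portable C sketch (int32_t)(((int64_t)x * y) >> 32).  The ARMv6+ variant
   uses smmul, older cores use smull and keep only the high word. */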
#if ARM_ARCH >= 6
static inline int32_t MULT32(int32_t x, int32_t y) {
  int32_t hi;
  asm volatile("smmul %[hi], %[x], %[y] \n\t"
               : [hi] "=&r" (hi)
               : [x] "r" (x), [y] "r" (y) );
  return(hi);
}
#else
static inline int32_t MULT32(int32_t x, int32_t y) {
  int32_t lo, hi;
  asm volatile("smull\t%0, %1, %2, %3 \n\t"
               : "=&r"(lo),"=&r"(hi)
               : "r"(x),"r"(y) );
  return(hi);
}
#endif

#define INCL_OPTIMIZED_MULT31
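/* MULT31(x, y): product of two Q31 fixed-point values; for in-range
   operands this is roughly (int32_t)(((int64_t)x * y) >> 31). */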
static inline int32_t MULT31(int32_t x, int32_t y) {
  return MULT32(x,y)<<1;
}

#define INCL_OPTIMIZED_MULT31_SHIFT15
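/* MULT31_SHIFT15(x, y): 64-bit product shifted right by 15, using the
   discarded bit 14 for rounding; roughly the portable C sketch
   (int32_t)((((int64_t)x * y) + (1 << 14)) >> 15). */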
static inline int32_t MULT31_SHIFT15(int32_t x, int32_t y) {
  int32_t lo,hi;
  asm volatile("smull   %0, %1, %2, %3\n\t"
               "movs    %0, %0, lsr #15\n\t"
               "adc %1, %0, %1, lsl #17\n\t"
               : "=&r"(lo),"=&r"(hi)
               : "r"(x),"r"(y)
               : "cc" );
  return(hi);
}

#define INCL_OPTIMIZED_MULT31_SHIFT16
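/* MULT31_SHIFT16(x, y): as above but shifted right by 16, rounding on
   bit 15; roughly (int32_t)((((int64_t)x * y) + (1 << 15)) >> 16). */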
static inline int32_t MULT31_SHIFT16(int32_t x, int32_t y) {
  int32_t lo,hi;
  asm volatile("smull   %0, %1, %2, %3\n\t"
               "movs    %0, %0, lsr #16\n\t"
               "adc %1, %0, %1, lsl #16\n\t"
               : "=&r"(lo),"=&r"(hi)
               : "r"(x),"r"(y)
               : "cc" );
  return(hi);
}

#define INCL_OPTIMIZED_XPROD32
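/* XPROD32(a, b, t, v, x, y): complex-style cross product; roughly
   x = (int32_t)(((int64_t)a*t + (int64_t)b*v) >> 32) and
   y = (int32_t)(((int64_t)b*t - (int64_t)a*v) >> 32). */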
#define XPROD32(a, b, t, v, x, y) \
{ \
  int32_t l; \
  asm("smull  %0, %1, %3, %5\n\t" \
      "rsb    %2, %6, #0\n\t" \
      "smlal  %0, %1, %4, %6\n\t" \
      "smull  %0, %2, %3, %2\n\t" \
      "smlal  %0, %2, %4, %5" \
      : "=&r" (l), "=&r" (x), "=&r" (y) \
      : "r" ((a)), "r" ((b)), "r" ((t)), "r" ((v)) ); \
}

#define INCL_OPTIMIZED_XPROD31_R
#define INCL_OPTIMIZED_XNPROD31_R
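/* XPROD31_R(a, b, t, v, x, y):  roughly x = (a*t + b*v) >> 31,
                                         y = (b*t - a*v) >> 31
   XNPROD31_R(a, b, t, v, x, y): roughly x = (a*t - b*v) >> 31,
                                         y = (b*t + a*v) >> 31
   with all products taken as 64-bit; see the precision note below on the
   ARMv6+ variants. */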
#if ARM_ARCH >= 6
/* These may yield slightly different results from the macros below, because
   only the high 32 bits of each multiplication are accumulated here, while
   the macros below use a 64-bit accumulator that is truncated to 32 bits. */
#define XPROD31_R(_a, _b, _t, _v, _x, _y)\
{\
  int32_t x1, y1;\
  asm("smmul  %[x1], %[t], %[a] \n\t"\
      "smmul  %[y1], %[t], %[b] \n\t"\
      "smmla  %[x1], %[v], %[b], %[x1] \n\t"\
      "smmls  %[y1], %[v], %[a], %[y1] \n\t"\
      : [x1] "=&r" (x1), [y1] "=&r" (y1)\
      : [a] "r" (_a), [b] "r" (_b), [t] "r" (_t), [v] "r" (_v) );\
  _x = x1 << 1;\
  _y = y1 << 1;\
}

#define XNPROD31_R(_a, _b, _t, _v, _x, _y)\
{\
  int32_t x1, y1;\
  asm("smmul  %[x1], %[t], %[a] \n\t"\
      "smmul  %[y1], %[t], %[b] \n\t"\
      "smmls  %[x1], %[v], %[b], %[x1] \n\t"\
      "smmla  %[y1], %[v], %[a], %[y1] \n\t"\
      : [x1] "=&r" (x1), [y1] "=&r" (y1)\
      : [a] "r" (_a), [b] "r" (_b), [t] "r" (_t), [v] "r" (_v) );\
  _x = x1 << 1;\
  _y = y1 << 1;\
}
#else
#define XPROD31_R(_a, _b, _t, _v, _x, _y)\
{\
  int32_t x1, y1, l;\
  asm("smull  %0, %1, %5, %3\n\t"\
      "rsb    %2, %3, #0\n\t"\
      "smlal  %0, %1, %6, %4\n\t"\
      "smull  %0, %2, %6, %2\n\t"\
      "smlal  %0, %2, %5, %4"\
      : "=&r" (l), "=&r" (x1), "=&r" (y1)\
      : "r" (_a), "r" (_b), "r" (_t), "r" (_v) );\
  _x = x1 << 1;\
  _y = y1 << 1;\
}

#define XNPROD31_R(_a, _b, _t, _v, _x, _y)\
{\
  int32_t x1, y1, l;\
  asm("smull  %0, %1, %5, %3\n\t"\
      "rsb    %2, %4, #0\n\t"\
      "smlal  %0, %1, %6, %2\n\t"\
      "smull  %0, %2, %5, %4\n\t"\
      "smlal  %0, %2, %6, %3"\
      : "=&r" (l), "=&r" (x1), "=&r" (y1)\
      : "r" (_a), "r" (_b), "r" (_t), "r" (_v) );\
  _x = x1 << 1;\
  _y = y1 << 1;\
}
#endif

#define INCL_OPTIMIZED_XPROD31
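/* XPROD31 / XNPROD31: pointer-output wrappers around the XPROD31_R and
   XNPROD31_R macros above. */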
static inline void XPROD31(int32_t  a, int32_t  b,
                           int32_t  t, int32_t  v,
                           int32_t *x, int32_t *y)
{
  int32_t _x1, _y1;
  XPROD31_R(a, b, t, v, _x1, _y1);
  *x = _x1;
  *y = _y1;
}

#define INCL_OPTIMIZED_XNPROD31
static inline void XNPROD31(int32_t  a, int32_t  b,
                            int32_t  t, int32_t  v,
                            int32_t *x, int32_t *y)
{
  int32_t _x1, _y1;
  XNPROD31_R(a, b, t, v, _x1, _y1);
  *x = _x1;
  *y = _y1;
}


#ifndef _V_VECT_OPS
#define _V_VECT_OPS

/* asm versions of vector operations for block.c, window.c */
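/* vect_add: x[i] += y[i] for i < n, four elements per iteration via
   ldmia/stmia, with a scalar tail for the remaining 0-3 elements. */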
static inline
void vect_add(int32_t *x, const int32_t *y, int n)
{
  while (n>=4) {
    asm volatile ("ldmia %[x], {r0, r1, r2, r3};"
                  "ldmia %[y]!, {r4, r5, r6, r7};"
                  "add r0, r0, r4;"
                  "add r1, r1, r5;"
                  "add r2, r2, r6;"
                  "add r3, r3, r7;"
                  "stmia %[x]!, {r0, r1, r2, r3};"
                  : [x] "+r" (x), [y] "+r" (y)
                  : : "r0", "r1", "r2", "r3",
                  "r4", "r5", "r6", "r7",
                  "memory");
    n -= 4;
  }
  /* add final elements */
  while (n>0) {
    *x++ += *y++;
    n--;
  }
}

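/* vect_copy: copy n 32-bit words from y to x, four per iteration;
   the scalar tail handles the remaining 0-3 words. */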
static inline
void vect_copy(int32_t *x, const int32_t *y, int n)
{
  while (n>=4) {
    asm volatile ("ldmia %[y]!, {r0, r1, r2, r3};"
                  "stmia %[x]!, {r0, r1, r2, r3};"
                  : [x] "+r" (x), [y] "+r" (y)
                  : : "r0", "r1", "r2", "r3",
                  "memory");
    n -= 4;
  }
  /* copy final elements */
  while (n>0) {
    *x++ = *y++;
    n--;
  }
}

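/* vect_mult_fw: data[i] = MULT31(data[i], window[i]) for i < n, walking the
   window forward, four elements per iteration (smull + shift, i.e. Q31). */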
static inline
void vect_mult_fw(int32_t *data, const int32_t *window, int n)
{
  while (n>=4) {
    asm volatile ("ldmia %[d], {r0, r1, r2, r3};"
                  "ldmia %[w]!, {r4, r5, r6, r7};"
                  "smull r8, r9, r0, r4;"
                  "mov   r0, r9, lsl #1;"
                  "smull r8, r9, r1, r5;"
                  "mov   r1, r9, lsl #1;"
                  "smull r8, r9, r2, r6;"
                  "mov   r2, r9, lsl #1;"
                  "smull r8, r9, r3, r7;"
                  "mov   r3, r9, lsl #1;"
                  "stmia %[d]!, {r0, r1, r2, r3};"
                  : [d] "+r" (data), [w] "+r" (window)
                  : : "r0", "r1", "r2", "r3",
                  "r4", "r5", "r6", "r7", "r8", "r9",
                  "memory" );
    n -= 4;
  }
  while(n>0) {
    *data = MULT31(*data, *window);
    data++;
    window++;
    n--;
  }
}

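/* vect_mult_bw: as vect_mult_fw, but the window is walked backward; ldmda
   loads window[-3]..window[0] and the products pair them with data[] in
   reverse order. */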
static inline
void vect_mult_bw(int32_t *data, const int32_t *window, int n)
{
  while (n>=4) {
    asm volatile ("ldmia %[d], {r0, r1, r2, r3};"
                  "ldmda %[w]!, {r4, r5, r6, r7};"
                  "smull r8, r9, r0, r7;"
                  "mov   r0, r9, lsl #1;"
                  "smull r8, r9, r1, r6;"
                  "mov   r1, r9, lsl #1;"
                  "smull r8, r9, r2, r5;"
                  "mov   r2, r9, lsl #1;"
                  "smull r8, r9, r3, r4;"
                  "mov   r3, r9, lsl #1;"
                  "stmia %[d]!, {r0, r1, r2, r3};"
                  : [d] "+r" (data), [w] "+r" (window)
                  : : "r0", "r1", "r2", "r3",
                  "r4", "r5", "r6", "r7", "r8", "r9",
                  "memory" );
    n -= 4;
  }
  while(n>0) {
    *data = MULT31(*data, *window);
    data++;
    window--;
    n--;
  }
}

#endif

/* not used anymore */
/*
#ifndef _V_CLIP_MATH
#define _V_CLIP_MATH

static inline int32_t CLIP_TO_15(int32_t x) {
  int tmp;
  asm volatile("subs    %1, %0, #32768\n\t"
           "movpl   %0, #0x7f00\n\t"
           "orrpl   %0, %0, #0xff\n"
           "adds    %1, %0, #32768\n\t"
           "movmi   %0, #0x8000"
           : "+r"(x),"=r"(tmp)
           :
           : "cc");
  return(x);
}

#endif
*/

#endif