1 #include "mathops.h"
2 #include <limits.h>
3
4 /*The fastest fallback strategy for platforms with fast multiplication appears
5 to be based on de Bruijn sequences~\cite{LP98}.
6 Tests confirmed this to be true even on an ARM11, where it is actually faster
7 than using the native clz instruction.
8 Define OC_ILOG_NODEBRUIJN to use a simpler fallback on platforms where
9 multiplication or table lookups are too expensive.
10
11 @UNPUBLISHED{LP98,
12 author="Charles E. Leiserson and Harald Prokop",
13 title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word",
14 month=Jun,
15 year=1998,
16 note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
17 }*/
/*Index table for the 32-bit de Bruijn bit scan: for a value v=1<<n, entry
   ((v*0x77CB531U)>>27)&0x1F holds n.
  The table is only read by the de Bruijn fallbacks, so it is only needed when
   OC_ILOG_NODEBRUIJN is not defined and either oc_ilog32() has no OC_CLZ32, or
   oc_ilog64() has no OC_CLZ64 and has to work on 32-bit words.
  The parentheses matter: && binds tighter than ||, and without them the table
   would also be emitted (unused) when OC_ILOG_NODEBRUIJN is defined.*/
#if !defined(OC_ILOG_NODEBRUIJN)&& \
 (!defined(OC_CLZ32)|| \
 (!defined(OC_CLZ64)&&LONG_MAX<9223372036854775807LL))
static const unsigned char OC_DEBRUIJN_IDX32[32]={
   0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
  31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
};
#endif
25
/*Returns the number of bits needed to hold _v once leading zeros are
   stripped.
  The two fallback implementations below return 0 for _v==0 and
   floor(log2(_v))+1 otherwise; the OC_CLZ32 variant is forced to 0 for _v==0
   by the final mask.*/
int oc_ilog32(ogg_uint32_t _v){
#if defined(OC_CLZ32)
  return (OC_CLZ32_OFFS-OC_CLZ32(_v))&-!!_v;
#elif defined(OC_ILOG_NODEBRUIJN)
  /*On a Pentium M, this branchless version tested as the fastest version
     without multiplications on 1,000,000,000 random 32-bit integers, edging
     out a similar version with branches, and a 256-entry LUT version.*/
  int nbits;
  int sh;
  /*Bit 0 of the result records whether any bit is set at all.*/
  nbits=_v>0;
  /*Binary search for the top set bit: at each step, shift the upper half down
     if it is non-empty and record the distance shifted.*/
  sh=(_v>0xFFFFU)<<4;
  _v>>=sh;
  nbits|=sh;
  sh=(_v>0xFFU)<<3;
  _v>>=sh;
  nbits|=sh;
  sh=(_v>0xFU)<<2;
  _v>>=sh;
  nbits|=sh;
  sh=(_v>3)<<1;
  _v>>=sh;
  nbits|=sh;
  /*At most two bits remain; account for the upper one.*/
  nbits+=_v>1;
  return nbits;
#else
  /*This de Bruijn sequence version is faster if you have a fast multiplier.*/
  int nbits;
  nbits=_v>0;
  /*Copy the top set bit into every position below it.*/
  _v|=_v>>1;
  _v|=_v>>2;
  _v|=_v>>4;
  _v|=_v>>8;
  _v|=_v>>16;
  /*Reduce to just the top set bit (zero becomes 1, which maps to index 0).*/
  _v=(_v>>1)+1;
  nbits+=OC_DEBRUIJN_IDX32[((_v*0x77CB531U)>>27)&0x1F];
  return nbits;
#endif
}
66
/*Returns the number of bits needed to hold _v once leading zeros are
   stripped: 0 for _v==0 and floor(log2(_v))+1 for _v>0 in the fallback
   implementations below.
  The result is only meaningful for _v>=0: the fallbacks do not agree with
   each other on negative inputs.*/
int oc_ilog64(ogg_int64_t _v){
#if defined(OC_CLZ64)
  return (OC_CLZ64_OFFS-OC_CLZ64(_v))&-!!_v;
#elif defined(OC_ILOG_NODEBRUIJN)
  ogg_uint32_t v;
  int          ret;
  int          m;
  ret=_v>0;
  /*Pick the upper 32-bit half if it is non-empty, then binary search it.*/
  m=(_v>0xFFFFFFFFU)<<5;
  v=(ogg_uint32_t)(_v>>m);
  ret|=m;
  m=(v>0xFFFFU)<<4;
  v>>=m;
  ret|=m;
  m=(v>0xFFU)<<3;
  v>>=m;
  ret|=m;
  m=(v>0xFU)<<2;
  v>>=m;
  ret|=m;
  m=(v>3)<<1;
  v>>=m;
  ret|=m;
  ret+=v>1;
  return ret;
#elif LONG_MAX<9223372036854775807LL
  /*If we don't have a 64-bit word, split it into two 32-bit halves.*/
  ogg_uint32_t v;
  int          ret;
  int          m;
  ret=_v>0;
  m=(_v>0xFFFFFFFFU)<<5;
  v=(ogg_uint32_t)(_v>>m);
  ret|=m;
  /*Copy the top set bit of the selected half into every lower position, then
     reduce to that single bit and look its index up.*/
  v|=v>>1;
  v|=v>>2;
  v|=v>>4;
  v|=v>>8;
  v|=v>>16;
  v=(v>>1)+1;
  ret+=OC_DEBRUIJN_IDX32[((v*0x77CB531U)>>27)&0x1F];
  return ret;
#else
  /*Otherwise do it in one 64-bit operation.*/
  /*For a value 1<<n, entry ((1<<n)*0x218A392CD3D5DBF>>58)&0x3F holds n.*/
  static const unsigned char OC_DEBRUIJN_IDX64[64]={
     0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
     5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
    63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,
    62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
  };
  unsigned long prod;
  int           ret;
  ret=_v>0;
  _v|=_v>>1;
  _v|=_v>>2;
  _v|=_v>>4;
  _v|=_v>>8;
  _v|=_v>>16;
  _v|=_v>>32;
  _v=(_v>>1)+1;
  /*The de Bruijn product is meant to wrap modulo 2**64, so it must be formed
     in unsigned arithmetic: the same multiply on the signed ogg_int64_t would
     overflow, which is undefined behavior.
    long is at least 64 bits wide on this preprocessor path, and bits 58...63
     of the product are the same for any wider type.*/
  prod=(unsigned long)_v*0x218A392CD3D5DBFUL;
  ret+=OC_DEBRUIJN_IDX64[(prod>>58)&0x3F];
  return ret;
#endif
}
134
135 /*round(2**(62+i)*atanh(2**(-(i+1)))/log(2))*/
136 static const ogg_int64_t OC_ATANH_LOG2[32]={
137 0x32B803473F7AD0F4LL,0x2F2A71BD4E25E916LL,0x2E68B244BB93BA06LL,
138 0x2E39FB9198CE62E4LL,0x2E2E683F68565C8FLL,0x2E2B850BE2077FC1LL,
139 0x2E2ACC58FE7B78DBLL,0x2E2A9E2DE52FD5F2LL,0x2E2A92A338D53EECLL,
140 0x2E2A8FC08F5E19B6LL,0x2E2A8F07E51A485ELL,0x2E2A8ED9BA8AF388LL,
141 0x2E2A8ECE2FE7384ALL,0x2E2A8ECB4D3E4B1ALL,0x2E2A8ECA94940FE8LL,
142 0x2E2A8ECA6669811DLL,0x2E2A8ECA5ADEDD6ALL,0x2E2A8ECA57FC347ELL,
143 0x2E2A8ECA57438A43LL,0x2E2A8ECA57155FB4LL,0x2E2A8ECA5709D510LL,
144 0x2E2A8ECA5706F267LL,0x2E2A8ECA570639BDLL,0x2E2A8ECA57060B92LL,
145 0x2E2A8ECA57060008LL,0x2E2A8ECA5705FD25LL,0x2E2A8ECA5705FC6CLL,
146 0x2E2A8ECA5705FC3ELL,0x2E2A8ECA5705FC33LL,0x2E2A8ECA5705FC30LL,
147 0x2E2A8ECA5705FC2FLL,0x2E2A8ECA5705FC2FLL
148 };
149
150 /*Computes the binary exponential of _z, a log base 2 in Q57 format.*/
oc_bexp64(ogg_int64_t _z)151 ogg_int64_t oc_bexp64(ogg_int64_t _z){
152 ogg_int64_t w;
153 ogg_int64_t z;
154 int ipart;
155 ipart=(int)(_z>>57);
156 if(ipart<0)return 0;
157 if(ipart>=63)return 0x7FFFFFFFFFFFFFFFLL;
158 z=_z-OC_Q57(ipart);
159 if(z){
160 ogg_int64_t mask;
161 long wlo;
162 int i;
163 /*C doesn't give us 64x64->128 muls, so we use CORDIC.
164 This is not particularly fast, but it's not being used in time-critical
165 code; it is very accurate.*/
166 /*z is the fractional part of the log in Q62 format.
167 We need 1 bit of headroom since the magnitude can get larger than 1
168 during the iteration, and a sign bit.*/
169 z<<=5;
170 /*w is the exponential in Q61 format (since it also needs headroom and can
171 get as large as 2.0); we could get another bit if we dropped the sign,
172 but we'll recover that bit later anyway.
173 Ideally this should start out as
174 \lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}}
175 but in order to guarantee convergence we have to repeat iterations 4,
176 13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/
177 w=0x26A3D0E401DD846DLL;
178 for(i=0;;i++){
179 mask=-(z<0);
180 w+=(w>>i+1)+mask^mask;
181 z-=OC_ATANH_LOG2[i]+mask^mask;
182 /*Repeat iteration 4.*/
183 if(i>=3)break;
184 z<<=1;
185 }
186 for(;;i++){
187 mask=-(z<0);
188 w+=(w>>i+1)+mask^mask;
189 z-=OC_ATANH_LOG2[i]+mask^mask;
190 /*Repeat iteration 13.*/
191 if(i>=12)break;
192 z<<=1;
193 }
194 for(;i<32;i++){
195 mask=-(z<0);
196 w+=(w>>i+1)+mask^mask;
197 z=z-(OC_ATANH_LOG2[i]+mask^mask)<<1;
198 }
199 wlo=0;
200 /*Skip the remaining iterations unless we really require that much
201 precision.
202 We could have bailed out earlier for smaller iparts, but that would
203 require initializing w from a table, as the limit doesn't converge to
204 61-bit precision until n=30.*/
205 if(ipart>30){
206 /*For these iterations, we just update the low bits, as the high bits
207 can't possibly be affected.
208 OC_ATANH_LOG2 has also converged (it actually did so one iteration
209 earlier, but that's no reason for an extra special case).*/
210 for(;;i++){
211 mask=-(z<0);
212 wlo+=(w>>i)+mask^mask;
213 z-=OC_ATANH_LOG2[31]+mask^mask;
214 /*Repeat iteration 40.*/
215 if(i>=39)break;
216 z<<=1;
217 }
218 for(;i<61;i++){
219 mask=-(z<0);
220 wlo+=(w>>i)+mask^mask;
221 z=z-(OC_ATANH_LOG2[31]+mask^mask)<<1;
222 }
223 }
224 w=(w<<1)+wlo;
225 }
226 else w=(ogg_int64_t)1<<62;
227 if(ipart<62)w=(w>>61-ipart)+1>>1;
228 return w;
229 }
230
231 /*Computes the binary logarithm of _w, returned in Q57 format.*/
oc_blog64(ogg_int64_t _w)232 ogg_int64_t oc_blog64(ogg_int64_t _w){
233 ogg_int64_t z;
234 int ipart;
235 if(_w<=0)return -1;
236 ipart=OC_ILOGNZ_64(_w)-1;
237 if(ipart>61)_w>>=ipart-61;
238 else _w<<=61-ipart;
239 z=0;
240 if(_w&_w-1){
241 ogg_int64_t x;
242 ogg_int64_t y;
243 ogg_int64_t u;
244 ogg_int64_t mask;
245 int i;
246 /*C doesn't give us 64x64->128 muls, so we use CORDIC.
247 This is not particularly fast, but it's not being used in time-critical
248 code; it is very accurate.*/
249 /*z is the fractional part of the log in Q61 format.*/
250 /*x and y are the cosh() and sinh(), respectively, in Q61 format.
251 We are computing z=2*atanh(y/x)=2*atanh((_w-1)/(_w+1)).*/
252 x=_w+((ogg_int64_t)1<<61);
253 y=_w-((ogg_int64_t)1<<61);
254 for(i=0;i<4;i++){
255 mask=-(y<0);
256 z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
257 u=x>>i+1;
258 x-=(y>>i+1)+mask^mask;
259 y-=u+mask^mask;
260 }
261 /*Repeat iteration 4.*/
262 for(i--;i<13;i++){
263 mask=-(y<0);
264 z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
265 u=x>>i+1;
266 x-=(y>>i+1)+mask^mask;
267 y-=u+mask^mask;
268 }
269 /*Repeat iteration 13.*/
270 for(i--;i<32;i++){
271 mask=-(y<0);
272 z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
273 u=x>>i+1;
274 x-=(y>>i+1)+mask^mask;
275 y-=u+mask^mask;
276 }
277 /*OC_ATANH_LOG2 has converged.*/
278 for(;i<40;i++){
279 mask=-(y<0);
280 z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
281 u=x>>i+1;
282 x-=(y>>i+1)+mask^mask;
283 y-=u+mask^mask;
284 }
285 /*Repeat iteration 40.*/
286 for(i--;i<62;i++){
287 mask=-(y<0);
288 z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
289 u=x>>i+1;
290 x-=(y>>i+1)+mask^mask;
291 y-=u+mask^mask;
292 }
293 z=z+8>>4;
294 }
295 return OC_Q57(ipart)+z;
296 }
297