/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id$

 ********************************************************************/

/*Some common macros for potential platform-specific optimization.*/
#include <math.h>
#if !defined(_ocintrin_H)
# define _ocintrin_H (1)

/*Some specific platforms may have optimized intrinsic or inline assembly
   versions of these functions which can substantially improve performance.
  We define macros for them to allow easy incorporation of these non-ANSI
   features.
  Unless stated otherwise, every macro below may evaluate its arguments more
   than once, so arguments must not have side effects.*/

/*Note that we do not provide a macro for abs(), because it is provided as a
   library function, which we assume is translated into an intrinsic to avoid
   the function call overhead and then implemented in the smartest way for the
   target platform.
  With modern gcc (4.x), this is true: it uses cmov instructions if the
   architecture supports it and branchless bit-twiddling if it does not (the
   speed difference between the two approaches is not measurable).
  Interestingly, the bit-twiddling method was patented in 2000 (US 6,073,150)
   by Sun Microsystems, despite prior art dating back to at least 1996:
   http://web.archive.org/web/19961201174141/www.x86.org/ftp/articles/pentopt/PENTOPT.TXT
  On gcc 3.x, however, our assumption is not true, as abs() is translated to a
   conditional jump, which is horrible on deeply pipelined architectures (e.g.,
   all consumer architectures for the past decade or more).
  Also be warned that -C*abs(x) where C is a constant is mis-optimized as
   abs(C*x) on every gcc release before 4.2.3.
  See bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34130 */

/*Modern gcc (4.x) can compile the naive versions of min and max with cmov if
   given an appropriate architecture, but the branchless bit-twiddling versions
   are just as fast, and do not require any special target architecture.
  Earlier gcc versions (3.x) compiled both versions to the same assembly
   instructions, because of the way they represented ((_b)>(_a)) internally.
  The comparison yields 0 or 1; negating it gives an all-zeros or all-ones
   mask that selects whether the difference is applied.*/
#define OC_MAXI(_a,_b)      ((_a)-(((_a)-(_b))&-((_b)>(_a))))
#define OC_MINI(_a,_b)      ((_a)+(((_b)-(_a))&-((_b)<(_a))))
/*Clamps an integer into the given range.
  If _a>_c, then the lower bound _a is respected over the upper bound _c (this
   behavior is required to meet our documented API behavior).
  _a: The lower bound.
  _b: The value to clamp.
  _c: The upper bound.*/
#define OC_CLAMPI(_a,_b,_c) (OC_MAXI(_a,OC_MINI(_b,_c)))
/*Clamps an integer to the range 0...255 and converts it to unsigned char.
  The first factor is 0 for negative _x (forcing the result to 0) and all ones
   otherwise; the second sets every bit when _x>255, which truncates to 255.*/
#define OC_CLAMP255(_x)     ((unsigned char)((((_x)<0)-1)&((_x)|-((_x)>255))))
/*This has a chance of compiling branchless, and is just as fast as the
   bit-twiddling method, which is slightly less portable, since it relies on a
   sign-extended rightshift, which is not guaranteed by ANSI (but present on
   every relevant platform).*/
#define OC_SIGNI(_a)        (((_a)>0)-((_a)<0))
/*Slightly more portable than relying on a sign-extended right-shift (which is
   not guaranteed by ANSI), and just as fast, since gcc (3.x and 4.x both)
   compile it into the right-shift anyway.*/
#define OC_SIGNMASK(_a)     (-((_a)<0))
/*Divides an integer by a power of two, truncating towards 0.
  This relies on right-shifting a negative value being sign-extending.
  _dividend: The integer to divide.
  _shift:    The non-negative power of two to divide by.
  _rmask:    (1<<_shift)-1*/
#define OC_DIV_POW2(_dividend,_shift,_rmask)\
  (((_dividend)+(OC_SIGNMASK(_dividend)&(_rmask)))>>(_shift))
/*Divides _x by 65536, truncating towards 0.*/
#define OC_DIV2_16(_x) OC_DIV_POW2(_x,16,0xFFFF)
/*Divides _x by 2, truncating towards 0.*/
#define OC_DIV2(_x) OC_DIV_POW2(_x,1,0x1)
/*Divides _x by 8, truncating towards 0.*/
#define OC_DIV8(_x) OC_DIV_POW2(_x,3,0x7)
/*Divides _x by 16, truncating towards 0.*/
#define OC_DIV16(_x) OC_DIV_POW2(_x,4,0xF)
/*Right shifts _dividend by _shift, adding _rval, and subtracting one for
   negative dividends first.
  When _rval is (1<<(_shift-1)), this is equivalent to division with rounding
   ties away from zero.*/
#define OC_DIV_ROUND_POW2(_dividend,_shift,_rval)\
  (((_dividend)+OC_SIGNMASK(_dividend)+(_rval))>>(_shift))
/*Divides _x by 2, rounding towards even numbers.*/
#define OC_DIV2_RE(_x) (((_x)+(((_x)>>1)&1))>>1)
/*Divides _x by (1<<(_shift)), rounding towards even numbers.*/
#define OC_DIV_POW2_RE(_x,_shift) \
  (((_x)+(((_x)>>(_shift))&1)+(((1<<(_shift))-1)>>1))>>(_shift))
/*Swaps two integers _a and _b if _a>_b.
  Both arguments must be modifiable int lvalues.
  t__ is either 0 (no swap) or _a^_b, in which case XOR-ing it into each
   operand exchanges them.*/
#define OC_SORT2I(_a,_b) \
  do{ \
    int t__; \
    t__=((_a)^(_b))&-((_b)<(_a)); \
    (_a)^=t__; \
    (_b)^=t__; \
  } \
  while(0)

/*Accesses one of four (signed) bytes given an index.
  This can be used to avoid small lookup tables.
  The bytes are packed using unsigned arithmetic, because shifting a signed
   int left into its sign bit (which happens for any negative _d) is undefined
   behavior.
  The selected byte is then sign-extended explicitly with the
   (x^0x80)-0x80 idiom, so the final conversion to signed char never sees an
   out-of-range value.
  _a..._d: The table entries, each in the range -128...127.
  _i:      The index of the entry to fetch, in the range 0...3.*/
#define OC_BYTE_TABLE32(_a,_b,_c,_d,_i) \
  ((signed char) \
   ((int)((((((_a)&0xFFU)|(((_b)&0xFFU)<<8)| \
   (((_c)&0xFFU)<<16)|(((_d)&0xFFU)<<24))>>((_i)*8))&0xFFU)^0x80U)-0x80))
/*Accesses one of eight (unsigned) nibbles given an index.
  This can be used to avoid small lookup tables.
  The nibbles are packed using unsigned arithmetic, because shifting a signed
   int left into its sign bit (which happens for any _h>=8) is undefined
   behavior; the result is converted back to int.
  _a..._h: The table entries, each in the range 0...15.
  _i:      The index of the entry to fetch, in the range 0...7.*/
#define OC_UNIBBLE_TABLE32(_a,_b,_c,_d,_e,_f,_g,_h,_i) \
  ((int)(((((_a)&0xFU)|(((_b)&0xFU)<<4)|(((_c)&0xFU)<<8)|(((_d)&0xFU)<<12)| \
   (((_e)&0xFU)<<16)|(((_f)&0xFU)<<20)|(((_g)&0xFU)<<24)|(((_h)&0xFU)<<28))>> \
   ((_i)*4))&0xFU))



/*All of these macros should expect floats as arguments.*/
#define OC_MAXF(_a,_b)      ((_a)<(_b)?(_b):(_a))
#define OC_MINF(_a,_b)      ((_a)>(_b)?(_b):(_a))
/*Evaluates to min(_a,max(_b,_c)), i.e., it clamps _b with _a acting as the
   upper bound and _c as the lower bound.
  NOTE(review): This argument order is the reverse of OC_CLAMPI (where _a is
   the lower bound); confirm against callers before relying on either order.*/
#define OC_CLAMPF(_a,_b,_c) (OC_MINF(_a,OC_MAXF(_b,_c)))
/*The following wrap the double-precision library routines and narrow the
   result back to float (or truncate it to int).*/
#define OC_FABSF(_f)        ((float)fabs(_f))
#define OC_SQRTF(_f)        ((float)sqrt(_f))
#define OC_POWF(_b,_e)      ((float)pow(_b,_e))
#define OC_LOGF(_f)         ((float)log(_f))
#define OC_IFLOORF(_f)      ((int)floor(_f))
#define OC_ICEILF(_f)       ((int)ceil(_f))

#endif