1 /***************************************************************************** 2 * ppccommon.h: ppc utility macros 3 ***************************************************************************** 4 * Copyright (C) 2003-2014 x264 project 5 * 6 * Authors: Eric Petit <eric.petit@lapsus.org> 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 * 18 * You should have received a copy of the GNU General Public License 19 * along with this program; if not, write to the Free Software 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 21 * 22 * This program is also available under a commercial proprietary license. 23 * For more information, contact us at licensing@x264.com. 24 *****************************************************************************/ 25 26 #if HAVE_ALTIVEC_H 27 #include <altivec.h> 28 #endif 29 30 /*********************************************************************** 31 * For constant vectors, use parentheses on OS X and braces on Linux 32 **********************************************************************/ 33 #if defined(__APPLE__) && __GNUC__ < 4 34 #define CV(a...) (a) 35 #else 36 #define CV(a...) {a} 37 #endif 38 39 /*********************************************************************** 40 * Vector types 41 **********************************************************************/ 42 #define vec_u8_t vector unsigned char 43 #define vec_s8_t vector signed char 44 #define vec_u16_t vector unsigned short 45 #define vec_s16_t vector signed short 46 #define vec_u32_t vector unsigned int 47 #define vec_s32_t vector signed int 48 49 typedef union { 50 uint32_t s[4]; 51 vec_u32_t v; 52 } vec_u32_u; 53 54 typedef union { 55 uint16_t s[8]; 56 vec_u16_t v; 57 } vec_u16_u; 58 59 typedef union { 60 int16_t s[8]; 61 vec_s16_t v; 62 } vec_s16_u; 63 64 typedef union { 65 uint8_t s[16]; 66 vec_u8_t v; 67 } vec_u8_u; 68 69 /*********************************************************************** 70 * Null vector 71 **********************************************************************/ 72 #define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 ) 73 74 #define zero_u8v (vec_u8_t) zerov 75 #define zero_s8v (vec_s8_t) zerov 76 #define zero_u16v (vec_u16_t) zerov 77 #define zero_s16v (vec_s16_t) zerov 78 #define zero_u32v (vec_u32_t) zerov 79 #define zero_s32v (vec_s32_t) zerov 80 81 /*********************************************************************** 82 * 8 <-> 16 bits conversions 83 **********************************************************************/ 84 #define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v ) 85 #define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v ) 86 #define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v ) 87 #define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v ) 88 89 #define vec_u8_to_u16(v) vec_u8_to_u16_h(v) 90 #define vec_u8_to_s16(v) vec_u8_to_s16_h(v) 91 92 #define vec_u16_to_u8(v) vec_pack( v, zero_u16v ) 93 #define vec_s16_to_u8(v) vec_packsu( v, zero_s16v ) 94 95 96 /*********************************************************************** 97 * 16 <-> 32 bits conversions 98 **********************************************************************/ 99 #define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( zero_u16v, (vec_u16_t) v ) 100 #define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( zero_u16v, (vec_u16_t) v ) 101 #define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( zero_u16v, (vec_u16_t) v ) 102 #define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( zero_u16v, (vec_u16_t) v ) 103 104 #define vec_u16_to_u32(v) vec_u16_to_u32_h(v) 105 #define vec_u16_to_s32(v) vec_u16_to_s32_h(v) 106 107 #define vec_u32_to_u16(v) vec_pack( v, zero_u32v ) 108 #define vec_s32_to_u16(v) vec_packsu( v, zero_s32v ) 109 110 111 /*********************************************************************** 112 * PREP_LOAD: declares two vectors required to perform unaligned loads 113 * VEC_LOAD: loads n bytes from u8 * p into vector v of type t where o is from original src offset 114 * VEC_LOAD:_G: loads n bytes from u8 * p into vectory v of type t - use when offset is not known 115 * VEC_LOAD_OFFSET: as above, but with offset vector known in advance 116 **********************************************************************/ 117 #define PREP_LOAD \ 118 vec_u8_t _hv, _lv 119 120 #define PREP_LOAD_SRC( src ) \ 121 vec_u8_t _##src##_ = vec_lvsl(0, src) 122 123 #define VEC_LOAD_G( p, v, n, t ) \ 124 _hv = vec_ld( 0, p ); \ 125 v = (t) vec_lvsl( 0, p ); \ 126 _lv = vec_ld( n - 1, p ); \ 127 v = (t) vec_perm( _hv, _lv, (vec_u8_t) v ) 128 129 #define VEC_LOAD( p, v, n, t, g ) \ 130 _hv = vec_ld( 0, p ); \ 131 _lv = vec_ld( n - 1, p ); \ 132 v = (t) vec_perm( _hv, _lv, (vec_u8_t) _##g##_ ) 133 134 #define VEC_LOAD_OFFSET( p, v, n, t, o ) \ 135 _hv = vec_ld( 0, p); \ 136 _lv = vec_ld( n - 1, p ); \ 137 v = (t) vec_perm( _hv, _lv, (vec_u8_t) o ) 138 139 #define VEC_LOAD_PARTIAL( p, v, n, t, g) \ 140 _hv = vec_ld( 0, p); \ 141 v = (t) vec_perm( _hv, _hv, (vec_u8_t) _##g##_ ) 142 143 144 /*********************************************************************** 145 * PREP_STORE##n: declares required vectors to store n bytes to a 146 * potentially unaligned address 147 * VEC_STORE##n: stores n bytes from vector v to address p 148 **********************************************************************/ 149 #define PREP_STORE16 \ 150 vec_u8_t _tmp1v \ 151 152 #define PREP_STORE16_DST( dst ) \ 153 vec_u8_t _##dst##l_ = vec_lvsl(0, dst); \ 154 vec_u8_t _##dst##r_ = vec_lvsr(0, dst); 155 156 #define VEC_STORE16( v, p, o ) \ 157 _hv = vec_ld( 0, p ); \ 158 _lv = vec_ld( 15, p ); \ 159 _tmp1v = vec_perm( _lv, _hv, _##o##l_ ); \ 160 _lv = vec_perm( (vec_u8_t) v, _tmp1v, _##o##r_ ); \ 161 vec_st( _lv, 15, (uint8_t *) p ); \ 162 _hv = vec_perm( _tmp1v, (vec_u8_t) v, _##o##r_ ); \ 163 vec_st( _hv, 0, (uint8_t *) p ) 164 165 166 #define PREP_STORE8 \ 167 vec_u8_t _tmp3v \ 168 169 #define VEC_STORE8( v, p ) \ 170 _tmp3v = vec_lvsl(0, p); \ 171 v = vec_perm(v, v, _tmp3v); \ 172 vec_ste((vec_u32_t)v,0,(uint32_t*)p); \ 173 vec_ste((vec_u32_t)v,4,(uint32_t*)p) 174 175 176 #define PREP_STORE4 \ 177 PREP_STORE16; \ 178 vec_u8_t _tmp2v, _tmp3v; \ 179 const vec_u8_t sel = \ 180 (vec_u8_t) CV(-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0) 181 182 #define VEC_STORE4( v, p ) \ 183 _tmp3v = vec_lvsr( 0, p ); \ 184 v = vec_perm( v, v, _tmp3v ); \ 185 _lv = vec_ld( 3, p ); \ 186 _tmp1v = vec_perm( sel, zero_u8v, _tmp3v ); \ 187 _lv = vec_sel( _lv, v, _tmp1v ); \ 188 vec_st( _lv, 3, p ); \ 189 _hv = vec_ld( 0, p ); \ 190 _tmp2v = vec_perm( zero_u8v, sel, _tmp3v ); \ 191 _hv = vec_sel( _hv, v, _tmp2v ); \ 192 vec_st( _hv, 0, p ) 193 194 /*********************************************************************** 195 * VEC_TRANSPOSE_8 196 *********************************************************************** 197 * Transposes a 8x8 matrix of s16 vectors 198 **********************************************************************/ 199 #define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \ 200 b0 = vec_mergeh( a0, a4 ); \ 201 b1 = vec_mergel( a0, a4 ); \ 202 b2 = vec_mergeh( a1, a5 ); \ 203 b3 = vec_mergel( a1, a5 ); \ 204 b4 = vec_mergeh( a2, a6 ); \ 205 b5 = vec_mergel( a2, a6 ); \ 206 b6 = vec_mergeh( a3, a7 ); \ 207 b7 = vec_mergel( a3, a7 ); \ 208 a0 = vec_mergeh( b0, b4 ); \ 209 a1 = vec_mergel( b0, b4 ); \ 210 a2 = vec_mergeh( b1, b5 ); \ 211 a3 = vec_mergel( b1, b5 ); \ 212 a4 = vec_mergeh( b2, b6 ); \ 213 a5 = vec_mergel( b2, b6 ); \ 214 a6 = vec_mergeh( b3, b7 ); \ 215 a7 = vec_mergel( b3, b7 ); \ 216 b0 = vec_mergeh( a0, a4 ); \ 217 b1 = vec_mergel( a0, a4 ); \ 218 b2 = vec_mergeh( a1, a5 ); \ 219 b3 = vec_mergel( a1, a5 ); \ 220 b4 = vec_mergeh( a2, a6 ); \ 221 b5 = vec_mergel( a2, a6 ); \ 222 b6 = vec_mergeh( a3, a7 ); \ 223 b7 = vec_mergel( a3, a7 ) 224 225 /*********************************************************************** 226 * VEC_TRANSPOSE_4 227 *********************************************************************** 228 * Transposes a 4x4 matrix of s16 vectors. 229 * Actually source and destination are 8x4. The low elements of the 230 * source are discarded and the low elements of the destination mustn't 231 * be used. 232 **********************************************************************/ 233 #define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \ 234 b0 = vec_mergeh( a0, a0 ); \ 235 b1 = vec_mergeh( a1, a0 ); \ 236 b2 = vec_mergeh( a2, a0 ); \ 237 b3 = vec_mergeh( a3, a0 ); \ 238 a0 = vec_mergeh( b0, b2 ); \ 239 a1 = vec_mergel( b0, b2 ); \ 240 a2 = vec_mergeh( b1, b3 ); \ 241 a3 = vec_mergel( b1, b3 ); \ 242 b0 = vec_mergeh( a0, a2 ); \ 243 b1 = vec_mergel( a0, a2 ); \ 244 b2 = vec_mergeh( a1, a3 ); \ 245 b3 = vec_mergel( a1, a3 ) 246 247 /*********************************************************************** 248 * VEC_DIFF_H 249 *********************************************************************** 250 * p1, p2: u8 * 251 * i1, i2, n: int 252 * d: s16v 253 * 254 * Loads n bytes from p1 and p2, do the diff of the high elements into 255 * d, increments p1 and p2 by i1 and i2 into known offset g 256 **********************************************************************/ 257 #define PREP_DIFF \ 258 LOAD_ZERO; \ 259 PREP_LOAD; \ 260 vec_s16_t pix1v, pix2v; 261 262 263 #define VEC_DIFF_H(p1,i1,p2,i2,n,d,g) \ 264 VEC_LOAD_PARTIAL( p1, pix1v, n, vec_s16_t, p1); \ 265 pix1v = vec_u8_to_s16( pix1v ); \ 266 VEC_LOAD( p2, pix2v, n, vec_s16_t, g); \ 267 pix2v = vec_u8_to_s16( pix2v ); \ 268 d = vec_sub( pix1v, pix2v ); \ 269 p1 += i1; \ 270 p2 += i2 271 272 #define VEC_DIFF_H_OFFSET(p1,i1,p2,i2,n,d,g1,g2) \ 273 pix1v = (vec_s16_t)vec_perm( vec_ld( 0, p1 ), zero_u8v, _##g1##_ );\ 274 pix1v = vec_u8_to_s16( pix1v ); \ 275 VEC_LOAD( p2, pix2v, n, vec_s16_t, g2); \ 276 pix2v = vec_u8_to_s16( pix2v ); \ 277 d = vec_sub( pix1v, pix2v ); \ 278 p1 += i1; \ 279 p2 += i2 280 281 282 /*********************************************************************** 283 * VEC_DIFF_HL 284 *********************************************************************** 285 * p1, p2: u8 * 286 * i1, i2: int 287 * dh, dl: s16v 288 * 289 * Loads 16 bytes from p1 and p2, do the diff of the high elements into 290 * dh, the diff of the low elements into dl, increments p1 and p2 by i1 291 * and i2 292 **********************************************************************/ 293 #define VEC_DIFF_HL(p1,i1,p2,i2,dh,dl) \ 294 pix1v = (vec_s16_t)vec_ld(0, p1); \ 295 temp0v = vec_u8_to_s16_h( pix1v ); \ 296 temp1v = vec_u8_to_s16_l( pix1v ); \ 297 VEC_LOAD( p2, pix2v, 16, vec_s16_t, p2); \ 298 temp2v = vec_u8_to_s16_h( pix2v ); \ 299 temp3v = vec_u8_to_s16_l( pix2v ); \ 300 dh = vec_sub( temp0v, temp2v ); \ 301 dl = vec_sub( temp1v, temp3v ); \ 302 p1 += i1; \ 303 p2 += i2 304 305 /*********************************************************************** 306 * VEC_DIFF_H_8BYTE_ALIGNED 307 *********************************************************************** 308 * p1, p2: u8 * 309 * i1, i2, n: int 310 * d: s16v 311 * 312 * Loads n bytes from p1 and p2, do the diff of the high elements into 313 * d, increments p1 and p2 by i1 and i2 314 * Slightly faster when we know we are loading/diffing 8bytes which 315 * are 8 byte aligned. Reduces need for two loads and two vec_lvsl()'s 316 **********************************************************************/ 317 #define PREP_DIFF_8BYTEALIGNED \ 318 LOAD_ZERO; \ 319 vec_s16_t pix1v, pix2v; \ 320 vec_u8_t pix1v8, pix2v8; \ 321 vec_u8_t permPix1, permPix2; \ 322 permPix1 = vec_lvsl(0, pix1); \ 323 permPix2 = vec_lvsl(0, pix2); \ 324 325 #define VEC_DIFF_H_8BYTE_ALIGNED(p1,i1,p2,i2,n,d) \ 326 pix1v8 = vec_perm(vec_ld(0,p1), zero_u8v, permPix1); \ 327 pix2v8 = vec_perm(vec_ld(0, p2), zero_u8v, permPix2); \ 328 pix1v = vec_u8_to_s16( pix1v8 ); \ 329 pix2v = vec_u8_to_s16( pix2v8 ); \ 330 d = vec_sub( pix1v, pix2v); \ 331 p1 += i1; \ 332 p2 += i2; 333