/*
    dct64_altivec.c: Discrete Cosine Transform (DCT) for Altivec

    copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1
    see COPYING and AUTHORS files in distribution or http://mpg123.org
    initially written by Michael Hipp
    altivec optimization by tmkk
*/

/*
 * Discrete Cosine Transform (DCT) for subband synthesis
 *
 * -funroll-loops (for gcc) will remove the loops for better performance
 * using loops in the source-code enhances readability
 *
 * TODO: write an optimized version for the down-sampling modes
 * (in these modes the bands 16-31 (2:1) or 8-31 (4:1) are zero)
 */

#include "mpg123lib_intern.h"

#ifndef __APPLE__
#include <altivec.h>
#endif

void dct64_altivec(real *out0,real *out1,real *samples)
{
    ALIGNED(16) real bufs[32];

    {
        register real *b1,*costab;

        vector unsigned char vinvert,vperm1,vperm2,vperm3,vperm4;
        vector float v1,v2,v3,v4,v5,v6,v7,v8;
        vector float vbs1,vbs2,vbs3,vbs4,vbs5,vbs6,vbs7,vbs8;
        vector float vbs9,vbs10,vbs11,vbs12,vbs13,vbs14,vbs15,vbs16;
        vector float vzero;
        b1 = samples;
        costab = pnts[0];

        vzero = vec_xor(vzero,vzero); /* all-zero vector via self-XOR */
        /* byte permute pattern that reverses the four floats within a vector */
#ifdef __APPLE__
        vinvert = (vector unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
#else
        vinvert = (vector unsigned char){12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
#endif
        vperm1 = vec_lvsl(0,b1); /* alignment permute for the (possibly unaligned) input samples */
        vperm2 = vec_perm(vperm1,vperm1,vinvert);

        v1 = vec_ld(0,b1);
        v2 = vec_ld(16,b1);
        v3 = vec_ld(112,b1);
        v4 = vec_ld(127,b1); /* vec_ld aligns down, so this fetches the block holding b1[31] */
        v5 = vec_perm(v1,v2,vperm1); /* b1[0,1,2,3] */
        v6 = vec_perm(v3,v4,vperm2); /* b1[31,30,29,28] */

        vbs1 = vec_add(v5,v6);
        vbs8 = vec_sub(v5,v6);

        v1 = vec_ld(32,b1);
        v4 = vec_ld(96,b1);
        v5 = vec_perm(v2,v1,vperm1); /* b1[4,5,6,7] */
        v6 = vec_perm(v4,v3,vperm2); /* b1[27,26,25,24] */

        vbs2 = vec_add(v5,v6);
        vbs7 = vec_sub(v5,v6);

        v2 = vec_ld(48,b1);
        v3 = vec_ld(80,b1);
        v5 = vec_perm(v1,v2,vperm1); /* b1[8,9,10,11] */
        v6 = vec_perm(v3,v4,vperm2); /* b1[23,22,21,20] */

        vbs3 = vec_add(v5,v6);
        vbs6 = vec_sub(v5,v6);

        v1 = vec_ld(64,b1);
        v5 = vec_perm(v2,v1,vperm1); /* b1[12,13,14,15] */
        v6 = vec_perm(v1,v3,vperm2); /* b1[19,18,17,16] */

        vbs4 = vec_add(v5,v6);
        vbs5 = vec_sub(v5,v6);

        v1 = vec_ld(0,costab);
        vbs8 = vec_madd(vbs8,v1,vzero);
        v2 = vec_ld(16,costab);
        vbs7 = vec_madd(vbs7,v2,vzero);
        v3 = vec_ld(32,costab);
        vbs6 = vec_madd(vbs6,v3,vzero);
        v4 = vec_ld(48,costab);
        vbs5 = vec_madd(vbs5,v4,vzero);
        vbs6 = vec_perm(vbs6,vbs6,vinvert);
        vbs5 = vec_perm(vbs5,vbs5,vinvert);


        costab = pnts[1];

        v1 = vec_perm(vbs4,vbs4,vinvert);
        vbs9 = vec_add(vbs1,v1);
        v3 = vec_sub(vbs1,v1);
        v5 = vec_ld(0,costab);
        v2 = vec_perm(vbs3,vbs3,vinvert);
        vbs10 = vec_add(vbs2,v2);
        v4 = vec_sub(vbs2,v2);
        v6 = vec_ld(16,costab);
        vbs12 = vec_madd(v3,v5,vzero);
        vbs11 = vec_madd(v4,v6,vzero);

        v7 = vec_sub(vbs7,vbs6);
        v8 = vec_sub(vbs8,vbs5);
        vbs13 = vec_add(vbs5,vbs8);
        vbs14 = vec_add(vbs6,vbs7);
        vbs15 = vec_madd(v7,v6,vzero);
        vbs16 = vec_madd(v8,v5,vzero);


        costab = pnts[2];

        v1 = vec_perm(vbs10,vbs10,vinvert);
        v5 = vec_perm(vbs14,vbs14,vinvert);
        vbs1 = vec_add(v1,vbs9);
        vbs5 = vec_add(v5,vbs13);
        v2 = vec_sub(vbs9,v1);
        v6 = vec_sub(vbs13,v5);
        v3 = vec_ld(0,costab);
        vbs11 = vec_perm(vbs11,vbs11,vinvert);
        vbs15 = vec_perm(vbs15,vbs15,vinvert);
        vbs3 = vec_add(vbs11,vbs12);
        vbs7 = vec_add(vbs15,vbs16);
        v4 = vec_sub(vbs12,vbs11);
        v7 = vec_sub(vbs16,vbs15);
        vbs2 = vec_madd(v2,v3,vzero);
        vbs4 = vec_madd(v4,v3,vzero);
        vbs6 = vec_madd(v6,v3,vzero);
        vbs8 = vec_madd(v7,v3,vzero);

        vbs2 = vec_perm(vbs2,vbs2,vinvert);
        vbs4 = vec_perm(vbs4,vbs4,vinvert);
        vbs6 = vec_perm(vbs6,vbs6,vinvert);
        vbs8 = vec_perm(vbs8,vbs8,vinvert);


        costab = pnts[3];

#ifdef __APPLE__
        vperm1 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
        vperm2 = (vector unsigned char)(12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27);
        vperm3 = (vector unsigned char)(0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19);
#else
        vperm1 = (vector unsigned char){0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23};
        vperm2 = (vector unsigned char){12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27};
        vperm3 = (vector unsigned char){0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19};
#endif
        vperm4 = vec_add(vperm3,vec_splat_u8(8));

        v1 = vec_ld(0,costab);
        v2 = vec_splat(v1,0);
        v3 = vec_splat(v1,1);
        v1 = vec_mergeh(v2,v3);

        v2 = vec_perm(vbs1,vbs3,vperm1);
        v3 = vec_perm(vbs2,vbs4,vperm1);
        v4 = vec_perm(vbs1,vbs3,vperm2);
        v5 = vec_perm(vbs2,vbs4,vperm2);
        v6 = vec_sub(v2,v4);
        v7 = vec_sub(v3,v5);
        v2 = vec_add(v2,v4);
        v3 = vec_add(v3,v5);
        v4 = vec_madd(v6,v1,vzero);
        v5 = vec_nmsub(v7,v1,vzero);
        vbs9 = vec_perm(v2,v4,vperm3);
        vbs11 = vec_perm(v2,v4,vperm4);
        vbs10 = vec_perm(v3,v5,vperm3);
        vbs12 = vec_perm(v3,v5,vperm4);

        v2 = vec_perm(vbs5,vbs7,vperm1);
        v3 = vec_perm(vbs6,vbs8,vperm1);
        v4 = vec_perm(vbs5,vbs7,vperm2);
        v5 = vec_perm(vbs6,vbs8,vperm2);
        v6 = vec_sub(v2,v4);
        v7 = vec_sub(v3,v5);
        v2 = vec_add(v2,v4);
        v3 = vec_add(v3,v5);
        v4 = vec_madd(v6,v1,vzero);
        v5 = vec_nmsub(v7,v1,vzero);
        vbs13 = vec_perm(v2,v4,vperm3);
        vbs15 = vec_perm(v2,v4,vperm4);
        vbs14 = vec_perm(v3,v5,vperm3);
        vbs16 = vec_perm(v3,v5,vperm4);


        costab = pnts[4];

        v1 = vec_lde(0,costab);
#ifdef __APPLE__
        v2 = (vector float)(1.0f,-1.0f,1.0f,-1.0f);
#else
        v2 = (vector float){1.0f,-1.0f,1.0f,-1.0f};
#endif
        v3 = vec_splat(v1,0);
        v1 = vec_madd(v2,v3,vzero);

        v2 = vec_mergeh(vbs9,vbs10);
        v3 = vec_mergel(vbs9,vbs10);
        v4 = vec_mergeh(vbs11,vbs12);
        v5 = vec_mergel(vbs11,vbs12);
        v6 = vec_mergeh(v2,v3);
        v7 = vec_mergel(v2,v3);
        v2 = vec_mergeh(v4,v5);
        v3 = vec_mergel(v4,v5);
        v4 = vec_sub(v6,v7);
        v5 = vec_sub(v2,v3);
        v6 = vec_add(v6,v7);
        v7 = vec_add(v2,v3);
        v2 = vec_madd(v4,v1,vzero);
        v3 = vec_madd(v5,v1,vzero);
        vbs1 = vec_mergeh(v6,v2);
        vbs2 = vec_mergel(v6,v2);
        vbs3 = vec_mergeh(v7,v3);
        vbs4 = vec_mergel(v7,v3);

        v2 = vec_mergeh(vbs13,vbs14);
        v3 = vec_mergel(vbs13,vbs14);
        v4 = vec_mergeh(vbs15,vbs16);
        v5 = vec_mergel(vbs15,vbs16);
        v6 = vec_mergeh(v2,v3);
        v7 = vec_mergel(v2,v3);
        v2 = vec_mergeh(v4,v5);
        v3 = vec_mergel(v4,v5);
        v4 = vec_sub(v6,v7);
        v5 = vec_sub(v2,v3);
        v6 = vec_add(v6,v7);
        v7 = vec_add(v2,v3);
        v2 = vec_madd(v4,v1,vzero);
        v3 = vec_madd(v5,v1,vzero);
        vbs5 = vec_mergeh(v6,v2);
        vbs6 = vec_mergel(v6,v2);
        vbs7 = vec_mergeh(v7,v3);
        vbs8 = vec_mergel(v7,v3);

        vec_st(vbs1,0,bufs);
        vec_st(vbs2,16,bufs);
        vec_st(vbs3,32,bufs);
        vec_st(vbs4,48,bufs);
        vec_st(vbs5,64,bufs);
        vec_st(vbs6,80,bufs);
        vec_st(vbs7,96,bufs);
        vec_st(vbs8,112,bufs);
    }

    /* scalar recombination passes on the intermediate buffer */
    {
        register real *b1;
        register int i;

        for(b1=bufs,i=8;i;i--,b1+=4)
            b1[2] += b1[3];

        for(b1=bufs,i=4;i;i--,b1+=8)
        {
            b1[4] += b1[6];
            b1[6] += b1[5];
            b1[5] += b1[7];
        }

        for(b1=bufs,i=2;i;i--,b1+=16)
        {
            b1[8] += b1[12];
            b1[12] += b1[10];
            b1[10] += b1[14];
            b1[14] += b1[9];
            b1[9] += b1[13];
            b1[13] += b1[11];
            b1[11] += b1[15];
        }
    }

    /* write the results into the two output windows */
    out0[0x10*16] = bufs[0];
    out0[0x10*15] = bufs[16+0] + bufs[16+8];
    out0[0x10*14] = bufs[8];
    out0[0x10*13] = bufs[16+8] + bufs[16+4];
    out0[0x10*12] = bufs[4];
    out0[0x10*11] = bufs[16+4] + bufs[16+12];
    out0[0x10*10] = bufs[12];
    out0[0x10* 9] = bufs[16+12] + bufs[16+2];
    out0[0x10* 8] = bufs[2];
    out0[0x10* 7] = bufs[16+2] + bufs[16+10];
    out0[0x10* 6] = bufs[10];
    out0[0x10* 5] = bufs[16+10] + bufs[16+6];
    out0[0x10* 4] = bufs[6];
    out0[0x10* 3] = bufs[16+6] + bufs[16+14];
    out0[0x10* 2] = bufs[14];
    out0[0x10* 1] = bufs[16+14] + bufs[16+1];
    out0[0x10* 0] = bufs[1];

    out1[0x10* 0] = bufs[1];
    out1[0x10* 1] = bufs[16+1] + bufs[16+9];
    out1[0x10* 2] = bufs[9];
    out1[0x10* 3] = bufs[16+9] + bufs[16+5];
    out1[0x10* 4] = bufs[5];
    out1[0x10* 5] = bufs[16+5] + bufs[16+13];
    out1[0x10* 6] = bufs[13];
    out1[0x10* 7] = bufs[16+13] + bufs[16+3];
    out1[0x10* 8] = bufs[3];
    out1[0x10* 9] = bufs[16+3] + bufs[16+11];
    out1[0x10*10] = bufs[11];
    out1[0x10*11] = bufs[16+11] + bufs[16+7];
    out1[0x10*12] = bufs[7];
    out1[0x10*13] = bufs[16+7] + bufs[16+15];
    out1[0x10*14] = bufs[15];
    out1[0x10*15] = bufs[16+15];
}