/*

   Copyright (C) 2014, The University of Texas at Austin

   This file is part of libflame and is available under the 3-Clause
   BSD license, which can be found in the LICENSE file at the top-level
   directory, or at http://opensource.org/licenses/BSD-3-Clause

*/
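
/*
   The MAC_Apply_G_mx3_as? macros apply two plane (Givens) rotations,
   G12 = ( gamma12, sigma12 ) and G23 = ( gamma23, sigma23 ), across three
   length-m_A column vectors a1, a2, and a3, with one macro per datatype
   (s, d, c, z). The rotation scalars are passed by address so the vector
   variants can broadcast them. Each row is updated as

     temp1  = alpha1;
     alpha1 = gamma12 * temp1  + sigma12 * alpha2;
     alpha2 = gamma12 * alpha2 - sigma12 * temp1;

     temp2  = alpha2;
     alpha2 = gamma23 * temp2  + sigma23 * alpha3;
     alpha3 = gamma23 * alpha3 - sigma23 * temp2;

   When vector intrinsics are disabled, the "as" macros alias the portable
   "op" implementations; under SSE, each kernel processes several rows per
   iteration with the main loop unrolled eight-fold. Aligned SSE loads and
   stores are used throughout, which presumes unit row stride.
*/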

#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

#define MAC_Apply_G_mx3_ass MAC_Apply_G_mx3_ops
#define MAC_Apply_G_mx3_asd MAC_Apply_G_mx3_opd
#define MAC_Apply_G_mx3_asc MAC_Apply_G_mx3_opc
#define MAC_Apply_G_mx3_asz MAC_Apply_G_mx3_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS
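
/*
   Single-precision real kernel: four floats per SSE vector. The main loop
   handles 32 rows per iteration (eight 4-wide blocks), a second loop
   handles remaining groups of four rows, and a scalar loop finishes the
   last m_A % 4 rows.
*/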

#define MAC_Apply_G_mx3_ass( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
  int n_iter32 = m_A / ( 4 * 8 ); \
  int n_left32 = m_A % ( 4 * 8 ); \
  int n_iter4 = n_left32 / ( 4 * 1 ); \
  int n_left = n_left32 % ( 4 * 1 ); \
  int i; \
  \
  const int step_a1 = inc_a1 * 4; \
  const int step_a2 = inc_a2 * 4; \
  const int step_a3 = inc_a3 * 4; \
  \
  float* restrict alpha1 = a1; \
  float* restrict alpha2 = a2; \
  float* restrict alpha3 = a3; \
  \
  v4sf_t a1v, a2v, a3v; \
  v4sf_t g12v, s12v; \
  v4sf_t g23v, s23v; \
  v4sf_t t1v, t2v; \
  \
  /* Broadcast each rotation scalar into all four vector lanes. */ \
  g12v.v = _mm_load1_ps( gamma12 ); \
  s12v.v = _mm_load1_ps( sigma12 ); \
  g23v.v = _mm_load1_ps( gamma23 ); \
  s23v.v = _mm_load1_ps( sigma23 ); \
  \
  /* Main loop: eight 4-wide blocks per iteration (32 rows). */ \
  for ( i = 0; i < n_iter32; ++i ) \
  { \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    /* t1v keeps the old a1 so both updates use pre-rotation values. */ \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
  } \
  \
  /* Clean up any remaining groups of four rows. */ \
  for ( i = 0; i < n_iter4; ++i ) \
  { \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
  } \
  \
  /* Finish the last m_A % 4 rows with scalar code. */ \
  for ( i = 0; i < n_left; ++i ) \
  { \
    float ga12 = *gamma12; \
    float si12 = *sigma12; \
    float ga23 = *gamma23; \
    float si23 = *sigma23; \
    float temp1; \
    float temp2; \
    float temp3; \
    \
    temp1 = *alpha1; \
    temp2 = *alpha2; \
    \
    *alpha1 = temp1 * ga12 + temp2 * si12; \
    *alpha2 = temp2 * ga12 - temp1 * si12; \
    \
    temp2 = *alpha2; \
    temp3 = *alpha3; \
    \
    *alpha2 = temp2 * ga23 + temp3 * si23; \
    *alpha3 = temp3 * ga23 - temp2 * si23; \
    \
    alpha1 += 1; \
    alpha2 += 1; \
    alpha3 += 1; \
  } \
}
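
/*
   Double-precision real kernel: two doubles per SSE vector. The main loop
   handles 16 rows per iteration (eight 2-wide blocks), a second loop
   handles remaining pairs of rows, and a scalar branch finishes the last
   row when m_A is odd.
*/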

#define MAC_Apply_G_mx3_asd( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
  int n_iter16 = m_A / ( 2 * 8 ); \
  int n_left16 = m_A % ( 2 * 8 ); \
  int n_iter2 = n_left16 / ( 2 * 1 ); \
  int n_left = n_left16 % ( 2 * 1 ); \
  int i; \
  \
  const int step_a1 = inc_a1 * 2; \
  const int step_a2 = inc_a2 * 2; \
  const int step_a3 = inc_a3 * 2; \
  \
  double* restrict alpha1 = a1; \
  double* restrict alpha2 = a2; \
  double* restrict alpha3 = a3; \
  \
  v2df_t a1v, a2v, a3v; \
  v2df_t g12v, s12v; \
  v2df_t g23v, s23v; \
  v2df_t t1v, t2v; \
  \
  g12v.v = _mm_loaddup_pd( gamma12 ); \
  s12v.v = _mm_loaddup_pd( sigma12 ); \
  g23v.v = _mm_loaddup_pd( gamma23 ); \
  s23v.v = _mm_loaddup_pd( sigma23 ); \
  \
  for ( i = 0; i < n_iter16; ++i ) \
  { \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
  } \
  \
  for ( i = 0; i < n_iter2; ++i ) \
  { \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
  } \
  \
  if ( n_left == 1 ) \
  { \
    double ga12 = *gamma12; \
    double si12 = *sigma12; \
    double ga23 = *gamma23; \
    double si23 = *sigma23; \
    double temp1; \
    double temp2; \
    double temp3; \
    \
    temp1 = *alpha1; \
    temp2 = *alpha2; \
    \
    *alpha1 = temp1 * ga12 + temp2 * si12; \
    *alpha2 = temp2 * ga12 - temp1 * si12; \
    \
    temp2 = *alpha2; \
    temp3 = *alpha3; \
    \
    *alpha2 = temp2 * ga23 + temp3 * si23; \
    *alpha3 = temp3 * ga23 - temp2 * si23; \
  } \
}
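
/*
   Single-precision complex kernel: one SSE vector holds two scomplex
   values as interleaved real/imaginary floats. Because the rotation
   scalars are real and broadcast to every lane, the elementwise float
   arithmetic of the real kernel applies unchanged to complex data; only
   the final leftover row is written out explicitly in terms of .real
   and .imag.
*/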

#define MAC_Apply_G_mx3_asc( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
  int n_iter16 = m_A / ( 2 * 8 ); \
  int n_left16 = m_A % ( 2 * 8 ); \
  int n_iter2 = n_left16 / ( 2 * 1 ); \
  int n_left = n_left16 % ( 2 * 1 ); \
  int i; \
  \
  const int step_a1 = inc_a1 * 2; \
  const int step_a2 = inc_a2 * 2; \
  const int step_a3 = inc_a3 * 2; \
  \
  scomplex* restrict alpha1 = a1; \
  scomplex* restrict alpha2 = a2; \
  scomplex* restrict alpha3 = a3; \
  \
  v4sf_t a1v, a2v, a3v; \
  v4sf_t g12v, s12v; \
  v4sf_t g23v, s23v; \
  v4sf_t t1v, t2v; \
  \
  g12v.v = _mm_load1_ps( gamma12 ); \
  s12v.v = _mm_load1_ps( sigma12 ); \
  g23v.v = _mm_load1_ps( gamma23 ); \
  s23v.v = _mm_load1_ps( sigma23 ); \
  \
  for ( i = 0; i < n_iter16; ++i ) \
  { \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
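    _mm_store_ps( ( float* )alpha1, a1v.v ); \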
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
  } \
  \
  for ( i = 0; i < n_iter2; ++i ) \
  { \
    \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
  } \
  \
  if ( n_left == 1 ) \
  { \
    float ga12 = *gamma12; \
    float si12 = *sigma12; \
    float ga23 = *gamma23; \
    float si23 = *sigma23; \
    scomplex temp1; \
    scomplex temp2; \
    scomplex temp3; \
    \
    temp1 = *alpha1; \
    temp2 = *alpha2; \
    \
    alpha1->real = temp1.real * ga12 + temp2.real * si12; \
    alpha2->real = temp2.real * ga12 - temp1.real * si12; \
    \
    alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
    alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
    \
    temp2 = *alpha2; \
    temp3 = *alpha3; \
    \
    alpha2->real = temp2.real * ga23 + temp3.real * si23; \
    alpha3->real = temp3.real * ga23 - temp2.real * si23; \
    \
    alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
    alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \
  } \
}

#define MAC_Apply_G_mx3_asz( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
  int n_iter = m_A / 8; \
  int n_left = m_A % 8; \
  int i; \
  \
  const int step_a1 = inc_a1 * 1; \
  const int step_a2 = inc_a2 * 1; \
  const int step_a3 = inc_a3 * 1; \
  \
  dcomplex* restrict alpha1 = a1; \
  dcomplex* restrict alpha2 = a2; \
  dcomplex* restrict alpha3 = a3; \
  \
  v2df_t a1v, a2v, a3v; \
  v2df_t g12v, s12v; \
  v2df_t g23v, s23v; \
  v2df_t t1v, t2v; \
  \
  g12v.v = _mm_loaddup_pd( gamma12 ); \
  s12v.v = _mm_loaddup_pd( sigma12 ); \
  g23v.v = _mm_loaddup_pd( gamma23 ); \
  s23v.v = _mm_loaddup_pd( sigma23 ); \
  \
  for ( i = 0; i < n_iter; ++i ) \
  { \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha2 += step_a2; \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha2 += step_a2; \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha2 += step_a2; \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha2 += step_a2; \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha2 += step_a2; \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha2 += step_a2; \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha2 += step_a2; \
    alpha3 += step_a3; \
    \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha2 += step_a2; \
    alpha3 += step_a3; \
  } \
  \
  /* One dcomplex fills a vector, so the remainder rows stay vectorized. */ \
  for ( i = 0; i < n_left; ++i ) \
  { \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    \
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
    a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
    \
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    \
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
    a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
    \
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
  } \
}

#endif
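
/*
   Usage sketch (illustrative only, not part of the library): applying the
   two rotations to three contiguous double-precision columns of length m.
   The names below (m, c12, s12, c23, s23, a1, a2, a3) are hypothetical.

     double c12, s12, c23, s23;  // rotation scalars computed by the caller
     double *a1, *a2, *a3;       // three columns, 16-byte aligned, stride 1

     MAC_Apply_G_mx3_asd( m, &c12, &s12, &c23, &s23,
                          a1, 1, a2, 1, a3, 1 );
*/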