/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
*    BLASTEST    : OK
*    CTEST       : OK
*    TEST        : OK
*    LAPACK-TEST : OK
**************************************************************************************/

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)

    #define XSFADD_R1   xsadddp
    #define XSFADD_R2   xssubdp
    #define XSFADD_I1   xsadddp
    #define XSFADD_I2   xsadddp

#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)

    #define XSFADD_R1   xsadddp
    #define XSFADD_R2   xsadddp
    #define XSFADD_I1   xssubdp
    #define XSFADD_I2   xsadddp

#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)

    #define XSFADD_R1   xsadddp
    #define XSFADD_R2   xsadddp
    #define XSFADD_I1   xsadddp
    #define XSFADD_I2   xssubdp

#else // CC || CR || RC || RR

    #define XSFADD_R1   xsadddp
    #define XSFADD_R2   xssubdp
    #define XSFADD_I1   xssubdp
    #define XSFADD_I2   xssubdp

#endif
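
/**********************************************************************************************
* Note on the XSFADD_* aliases above: the SAVE macros below reduce each accumulator
* pair to a scalar real/imaginary part, and the add-vs-subtract choice encodes
* conjugation of A and/or B for the sixteen transpose/conjugate variants.
* A minimal scalar C sketch of the sign selection (illustration only, not assembled;
* conj_a/conj_b are hypothetical flags standing in for the compile-time defines):
*
*   #include <stdio.h>
*   // rr = re(a)*re(b), ii = im(a)*im(b), ri = re(a)*im(b), ir = im(a)*re(b)
*   static void zmul(double ar, double ai, double br, double bi,
*                    int conj_a, int conj_b, double *cr, double *ci)
*   {
*       double rr = ar * br, ii = ai * bi;
*       double ri = ar * bi, ir = ai * br;
*       *cr = (conj_a == conj_b) ? rr - ii : rr + ii;        // XSFADD_R1 / XSFADD_R2
*       *ci = (conj_b ? -ri : ri) + (conj_a ? -ir : ir);     // XSFADD_I2 / XSFADD_I1
*   }
*   int main(void) {
*       double cr, ci;
*       zmul(1, 2, 3, 4, 0, 0, &cr, &ci);     // plain NN case
*       printf("%g %+gi\n", cr, ci);          // (1+2i)(3+4i) = -5 +10i
*       return 0;
*   }
**********************************************************************************************/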
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/

#if defined(_AIX)
define(`LOAD2x8_1', `
#else
.macro LOAD2x8_1
#endif

    lxvd2x  vs16, o0, BO      // load real part from B
    lxvd2x  vs17, o16, BO     // load imag part from B
    lxvd2x  vs18, o32, BO     // load real part from B
    lxvd2x  vs19, o48, BO     // load imag part from B

    addi    BO, BO, 64

    lxvd2x  vs0, o0, AO       // load real,imag from A
    lxvd2x  vs1, o16, AO      // load real,imag from A
    lxvd2x  vs2, o32, AO      // load real,imag from A
    lxvd2x  vs3, o48, AO      // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs4, o0, AO       // load real,imag from A
    lxvd2x  vs5, o16, AO      // load real,imag from A
    lxvd2x  vs6, o32, AO      // load real,imag from A
    lxvd2x  vs7, o48, AO      // load real,imag from A

    addi    AO, AO, 64


#if defined(_AIX)
')
#else
.endm
#endif
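
/**********************************************************************************************
* Data layout note: each lxvd2x from A pulls one complex double as {real, imag}
* doubleword lanes, while the packed B panel stores the real and the imaginary part
* of each element splatted across both lanes (hence the separate "real part" /
* "imag part" loads).  A small runnable C sketch of what one xvmuldp pair then
* holds (illustration only, not assembled; plain arrays stand in for VSX lanes):
*
*   #include <stdio.h>
*   int main(void) {
*       double a[2]  = {1.0, 2.0};     // one A element {re, im}, as lxvd2x loads it
*       double br[2] = {3.0, 3.0};     // real part of one B element, splatted (vs16 style)
*       double bi[2] = {4.0, 4.0};     // imag part of the same element (vs17 style)
*       double acc_r[2], acc_i[2];     // play the roles of vs32 / vs33
*       for (int k = 0; k < 2; k++) {
*           acc_r[k] = a[k] * br[k];   // {ar*br, ai*br}: real*real, imag*real
*           acc_i[k] = a[k] * bi[k];   // {ar*bi, ai*bi}: real*imag, imag*imag
*       }
*       // non-conjugated reduction, performed later by the SAVE macros:
*       printf("(%g,%g)\n", acc_r[0] - acc_i[1], acc_i[0] + acc_r[1]);  // (-5,10)
*       return 0;
*   }
**********************************************************************************************/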
#if defined(_AIX)
define(`KERNEL2x8_I1', `
#else
.macro KERNEL2x8_I1
#endif

    lxvd2x  vs8, o0, AO       // load real,imag from A
    lxvd2x  vs9, o16, AO      // load real,imag from A
    lxvd2x  vs10, o32, AO     // load real,imag from A
    lxvd2x  vs11, o48, AO     // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs12, o0, AO      // load real,imag from A
    lxvd2x  vs13, o16, AO     // load real,imag from A
    lxvd2x  vs14, o32, AO     // load real,imag from A
    lxvd2x  vs15, o48, AO     // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs20, o0, BO      // load real part from B
    lxvd2x  vs21, o16, BO     // load imag part from B
    lxvd2x  vs22, o32, BO     // load real part from B
    lxvd2x  vs23, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmuldp vs32, vs0, vs16   // real*real, imag*real
    xvmuldp vs33, vs0, vs17   // real*imag, imag*imag
    xvmuldp vs34, vs1, vs16   // real*real, imag*real
    xvmuldp vs35, vs1, vs17   // real*imag, imag*imag
    xvmuldp vs36, vs2, vs16   // real*real, imag*real
    xvmuldp vs37, vs2, vs17   // real*imag, imag*imag
    xvmuldp vs38, vs3, vs16   // real*real, imag*real
    xvmuldp vs39, vs3, vs17   // real*imag, imag*imag
    xvmuldp vs40, vs4, vs16   // real*real, imag*real
    xvmuldp vs41, vs4, vs17   // real*imag, imag*imag
    xvmuldp vs42, vs5, vs16   // real*real, imag*real
    xvmuldp vs43, vs5, vs17   // real*imag, imag*imag
    xvmuldp vs44, vs6, vs16   // real*real, imag*real
    xvmuldp vs45, vs6, vs17   // real*imag, imag*imag
    xvmuldp vs46, vs7, vs16   // real*real, imag*real
    xvmuldp vs47, vs7, vs17   // real*imag, imag*imag

    xvmuldp vs48, vs0, vs18   // real*real, imag*real
    xvmuldp vs49, vs0, vs19   // real*imag, imag*imag
    xvmuldp vs50, vs1, vs18   // real*real, imag*real
    xvmuldp vs51, vs1, vs19   // real*imag, imag*imag
    xvmuldp vs52, vs2, vs18   // real*real, imag*real
    xvmuldp vs53, vs2, vs19   // real*imag, imag*imag
    xvmuldp vs54, vs3, vs18   // real*real, imag*real
    xvmuldp vs55, vs3, vs19   // real*imag, imag*imag
    xvmuldp vs56, vs4, vs18   // real*real, imag*real
    xvmuldp vs57, vs4, vs19   // real*imag, imag*imag
    xvmuldp vs58, vs5, vs18   // real*real, imag*real
    xvmuldp vs59, vs5, vs19   // real*imag, imag*imag
    xvmuldp vs60, vs6, vs18   // real*real, imag*real
    xvmuldp vs61, vs6, vs19   // real*imag, imag*imag
    xvmuldp vs62, vs7, vs18   // real*real, imag*real
    xvmuldp vs63, vs7, vs19   // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x8_1', `
#else
.macro KERNEL2x8_1
#endif

    lxvd2x  vs8, o0, AO       // load real,imag from A
    lxvd2x  vs9, o16, AO      // load real,imag from A
    lxvd2x  vs10, o32, AO     // load real,imag from A
    lxvd2x  vs11, o48, AO     // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs12, o0, AO      // load real,imag from A
    lxvd2x  vs13, o16, AO     // load real,imag from A
    lxvd2x  vs14, o32, AO     // load real,imag from A
    lxvd2x  vs15, o48, AO     // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs20, o0, BO      // load real part from B
    lxvd2x  vs21, o16, BO     // load imag part from B
    lxvd2x  vs22, o32, BO     // load real part from B
    lxvd2x  vs23, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
    xvmaddadp vs34, vs1, vs16 // real*real, imag*real
    xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
    xvmaddadp vs36, vs2, vs16 // real*real, imag*real
    xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
    xvmaddadp vs38, vs3, vs16 // real*real, imag*real
    xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
    xvmaddadp vs40, vs4, vs16 // real*real, imag*real
    xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
    xvmaddadp vs42, vs5, vs16 // real*real, imag*real
    xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
    xvmaddadp vs44, vs6, vs16 // real*real, imag*real
    xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
    xvmaddadp vs46, vs7, vs16 // real*real, imag*real
    xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag

    xvmaddadp vs48, vs0, vs18 // real*real, imag*real
    xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
    xvmaddadp vs50, vs1, vs18 // real*real, imag*real
    xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
    xvmaddadp vs52, vs2, vs18 // real*real, imag*real
    xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
    xvmaddadp vs54, vs3, vs18 // real*real, imag*real
    xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
    xvmaddadp vs56, vs4, vs18 // real*real, imag*real
    xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
    xvmaddadp vs58, vs5, vs18 // real*real, imag*real
    xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
    xvmaddadp vs60, vs6, vs18 // real*real, imag*real
    xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
    xvmaddadp vs62, vs7, vs18 // real*real, imag*real
    xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_2', `
#else
.macro KERNEL2x8_2
#endif

    lxvd2x  vs0, o0, AO       // load real,imag from A
    lxvd2x  vs1, o16, AO      // load real,imag from A
    lxvd2x  vs2, o32, AO      // load real,imag from A
    lxvd2x  vs3, o48, AO      // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs4, o0, AO       // load real,imag from A
    lxvd2x  vs5, o16, AO      // load real,imag from A
    lxvd2x  vs6, o32, AO      // load real,imag from A
    lxvd2x  vs7, o48, AO      // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs16, o0, BO      // load real part from B
    lxvd2x  vs17, o16, BO     // load imag part from B
    lxvd2x  vs18, o32, BO     // load real part from B
    lxvd2x  vs19, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmaddadp vs32, vs8, vs20  // real*real, imag*real
    xvmaddadp vs33, vs8, vs21  // real*imag, imag*imag
    xvmaddadp vs34, vs9, vs20  // real*real, imag*real
    xvmaddadp vs35, vs9, vs21  // real*imag, imag*imag
    xvmaddadp vs36, vs10, vs20 // real*real, imag*real
    xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
    xvmaddadp vs38, vs11, vs20 // real*real, imag*real
    xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
    xvmaddadp vs40, vs12, vs20 // real*real, imag*real
    xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
    xvmaddadp vs42, vs13, vs20 // real*real, imag*real
    xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
    xvmaddadp vs44, vs14, vs20 // real*real, imag*real
    xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
    xvmaddadp vs46, vs15, vs20 // real*real, imag*real
    xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag

    xvmaddadp vs48, vs8, vs22  // real*real, imag*real
    xvmaddadp vs49, vs8, vs23  // real*imag, imag*imag
    xvmaddadp vs50, vs9, vs22  // real*real, imag*real
    xvmaddadp vs51, vs9, vs23  // real*imag, imag*imag
    xvmaddadp vs52, vs10, vs22 // real*real, imag*real
    xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
    xvmaddadp vs54, vs11, vs22 // real*real, imag*real
    xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
    xvmaddadp vs56, vs12, vs22 // real*real, imag*real
    xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
    xvmaddadp vs58, vs13, vs22 // real*real, imag*real
    xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
    xvmaddadp vs60, vs14, vs22 // real*real, imag*real
    xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
    xvmaddadp vs62, vs15, vs22 // real*real, imag*real
    xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x8_E2', `
#else
.macro KERNEL2x8_E2
#endif


    xvmaddadp vs32, vs8, vs20  // real*real, imag*real
    xvmaddadp vs33, vs8, vs21  // real*imag, imag*imag
    xvmaddadp vs34, vs9, vs20  // real*real, imag*real
    xvmaddadp vs35, vs9, vs21  // real*imag, imag*imag
    xvmaddadp vs36, vs10, vs20 // real*real, imag*real
    xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
    xvmaddadp vs38, vs11, vs20 // real*real, imag*real
    xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
    xvmaddadp vs40, vs12, vs20 // real*real, imag*real
    xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
    xvmaddadp vs42, vs13, vs20 // real*real, imag*real
    xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
    xvmaddadp vs44, vs14, vs20 // real*real, imag*real
    xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
    xvmaddadp vs46, vs15, vs20 // real*real, imag*real
    xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag

    xvmaddadp vs48, vs8, vs22  // real*real, imag*real
    xvmaddadp vs49, vs8, vs23  // real*imag, imag*imag
    xvmaddadp vs50, vs9, vs22  // real*real, imag*real
    xvmaddadp vs51, vs9, vs23  // real*imag, imag*imag
    xvmaddadp vs52, vs10, vs22 // real*real, imag*real
    xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
    xvmaddadp vs54, vs11, vs22 // real*real, imag*real
    xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
    xvmaddadp vs56, vs12, vs22 // real*real, imag*real
    xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
    xvmaddadp vs58, vs13, vs22 // real*real, imag*real
    xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
    xvmaddadp vs60, vs14, vs22 // real*real, imag*real
    xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
    xvmaddadp vs62, vs15, vs22 // real*real, imag*real
    xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
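
/**********************************************************************************************
* Scheduling note: the load macro primes the "even" register set (vs0-vs7, vs16-vs19);
* the _I1 kernel issues the first products with xvmuldp while fetching the "odd" set
* (vs8-vs15, vs20-vs23); the _1 and _2 kernels then alternate between consuming one
* set and refilling the other; and the _E2 kernel drains without loading.  A runnable
* C model of that ping-pong for a complex dot product (illustration only; assumes an
* even trip count n >= 2, and dot is a hypothetical stand-in for the real driver):
*
*   #include <complex.h>
*   #include <stdio.h>
*   static double complex dot(const double complex *a, const double complex *b, long n)
*   {
*       double complex ea = a[0], eb = b[0];      // prime the even set
*       double complex oa = a[1], ob = b[1];      // _I1: fetch the odd set...
*       double complex acc = ea * eb;             // ...and start the even products
*       long i = 2;
*       while (i < n) {
*           ea = a[i]; eb = b[i]; acc += oa * ob; i++;   // _2: work odd, fetch even
*           oa = a[i]; ob = b[i]; acc += ea * eb; i++;   // _1: work even, fetch odd
*       }
*       acc += oa * ob;                           // _E2: drain, nothing fetched
*       return acc;
*   }
*   int main(void) {
*       double complex a[4] = {1, 2, 3, 4}, b[4] = {1, I, -1, -I};
*       double complex r = dot(a, b, 4);
*       printf("%g %+gi\n", creal(r), cimag(r));  // -2 -2i
*       return 0;
*   }
**********************************************************************************************/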
#if defined(_AIX)
define(`KERNEL2x8_SUBI1', `
#else
.macro KERNEL2x8_SUBI1
#endif

    lxvd2x  vs0, o0, AO       // load real,imag from A
    lxvd2x  vs1, o16, AO      // load real,imag from A
    lxvd2x  vs2, o32, AO      // load real,imag from A
    lxvd2x  vs3, o48, AO      // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs4, o0, AO       // load real,imag from A
    lxvd2x  vs5, o16, AO      // load real,imag from A
    lxvd2x  vs6, o32, AO      // load real,imag from A
    lxvd2x  vs7, o48, AO      // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs16, o0, BO      // load real part from B
    lxvd2x  vs17, o16, BO     // load imag part from B
    lxvd2x  vs18, o32, BO     // load real part from B
    lxvd2x  vs19, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmuldp vs32, vs0, vs16   // real*real, imag*real
    xvmuldp vs33, vs0, vs17   // real*imag, imag*imag
    xvmuldp vs34, vs1, vs16   // real*real, imag*real
    xvmuldp vs35, vs1, vs17   // real*imag, imag*imag
    xvmuldp vs36, vs2, vs16   // real*real, imag*real
    xvmuldp vs37, vs2, vs17   // real*imag, imag*imag
    xvmuldp vs38, vs3, vs16   // real*real, imag*real
    xvmuldp vs39, vs3, vs17   // real*imag, imag*imag
    xvmuldp vs40, vs4, vs16   // real*real, imag*real
    xvmuldp vs41, vs4, vs17   // real*imag, imag*imag
    xvmuldp vs42, vs5, vs16   // real*real, imag*real
    xvmuldp vs43, vs5, vs17   // real*imag, imag*imag
    xvmuldp vs44, vs6, vs16   // real*real, imag*real
    xvmuldp vs45, vs6, vs17   // real*imag, imag*imag
    xvmuldp vs46, vs7, vs16   // real*real, imag*real
    xvmuldp vs47, vs7, vs17   // real*imag, imag*imag

    xvmuldp vs48, vs0, vs18   // real*real, imag*real
    xvmuldp vs49, vs0, vs19   // real*imag, imag*imag
    xvmuldp vs50, vs1, vs18   // real*real, imag*real
    xvmuldp vs51, vs1, vs19   // real*imag, imag*imag
    xvmuldp vs52, vs2, vs18   // real*real, imag*real
    xvmuldp vs53, vs2, vs19   // real*imag, imag*imag
    xvmuldp vs54, vs3, vs18   // real*real, imag*real
    xvmuldp vs55, vs3, vs19   // real*imag, imag*imag
    xvmuldp vs56, vs4, vs18   // real*real, imag*real
    xvmuldp vs57, vs4, vs19   // real*imag, imag*imag
    xvmuldp vs58, vs5, vs18   // real*real, imag*real
    xvmuldp vs59, vs5, vs19   // real*imag, imag*imag
    xvmuldp vs60, vs6, vs18   // real*real, imag*real
    xvmuldp vs61, vs6, vs19   // real*imag, imag*imag
    xvmuldp vs62, vs7, vs18   // real*real, imag*real
    xvmuldp vs63, vs7, vs19   // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_SUB1', `
#else
.macro KERNEL2x8_SUB1
#endif

    lxvd2x  vs0, o0, AO       // load real,imag from A
    lxvd2x  vs1, o16, AO      // load real,imag from A
    lxvd2x  vs2, o32, AO      // load real,imag from A
    lxvd2x  vs3, o48, AO      // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs4, o0, AO       // load real,imag from A
    lxvd2x  vs5, o16, AO      // load real,imag from A
    lxvd2x  vs6, o32, AO      // load real,imag from A
    lxvd2x  vs7, o48, AO      // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs16, o0, BO      // load real part from B
    lxvd2x  vs17, o16, BO     // load imag part from B
    lxvd2x  vs18, o32, BO     // load real part from B
    lxvd2x  vs19, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
    xvmaddadp vs34, vs1, vs16 // real*real, imag*real
    xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
    xvmaddadp vs36, vs2, vs16 // real*real, imag*real
    xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
    xvmaddadp vs38, vs3, vs16 // real*real, imag*real
    xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
    xvmaddadp vs40, vs4, vs16 // real*real, imag*real
    xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
    xvmaddadp vs42, vs5, vs16 // real*real, imag*real
    xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
    xvmaddadp vs44, vs6, vs16 // real*real, imag*real
    xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
    xvmaddadp vs46, vs7, vs16 // real*real, imag*real
    xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag

    xvmaddadp vs48, vs0, vs18 // real*real, imag*real
    xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
    xvmaddadp vs50, vs1, vs18 // real*real, imag*real
    xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
    xvmaddadp vs52, vs2, vs18 // real*real, imag*real
    xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
    xvmaddadp vs54, vs3, vs18 // real*real, imag*real
    xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
    xvmaddadp vs56, vs4, vs18 // real*real, imag*real
    xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
    xvmaddadp vs58, vs5, vs18 // real*real, imag*real
    xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
    xvmaddadp vs60, vs6, vs18 // real*real, imag*real
    xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
    xvmaddadp vs62, vs7, vs18 // real*real, imag*real
    xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
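
/**********************************************************************************************
* The SAVE macros below reduce each accumulator pair to one complex element, scale it
* by complex alpha, and (unless TRMMKERNEL is defined, in which case C is never read)
* add the loaded C values before storing.  A scalar C sketch of that per-element
* epilogue with the plain NN signs (illustration only; save_elem is a hypothetical
* helper, not part of this file):
*
*   #include <stdio.h>
*   // acc_r = {rr, ir} and acc_i = {ri, ii} model one vs32/vs33 pair
*   static void save_elem(const double acc_r[2], const double acc_i[2],
*                         double alpha_r, double alpha_i,
*                         const double c_in[2], double c_out[2], int trmm)
*   {
*       double re = acc_r[0] - acc_i[1];        // XSFADD_R1 / XSFADD_R2
*       double im = acc_r[1] + acc_i[0];        // XSFADD_I1 / XSFADD_I2
*       double r = re * alpha_r - im * alpha_i; // xsmuldp, xssubdp
*       double i = re * alpha_i + im * alpha_r; // xsmuldp, xsadddp
*       c_out[0] = trmm ? r : c_in[0] + r;      // xvadddp with loaded C, skipped for TRMM
*       c_out[1] = trmm ? i : c_in[1] + i;      // then stored with stxvd2x
*   }
*   int main(void) {
*       double acc_r[2] = {3.0, 6.0}, acc_i[2] = {4.0, 8.0};  // pieces of (1+2i)(3+4i)
*       double c[2] = {1.0, 1.0}, out[2];
*       save_elem(acc_r, acc_i, 1.0, 0.0, c, out, 0);
*       printf("(%g,%g)\n", out[0], out[1]);    // (-4,11) = (1,1) + (-5,10)
*       return 0;
*   }
**********************************************************************************************/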
#if defined(_AIX)
define(`SAVE2x8', `
#else
.macro SAVE2x8
#endif


    mr      T1, CO
    addi    T2, T1, 64

#ifndef TRMMKERNEL

    lxvd2x  vs16, o0, T1
    lxvd2x  vs17, o16, T1
    lxvd2x  vs18, o32, T1
    lxvd2x  vs19, o48, T1
    lxvd2x  vs20, o0, T2
    lxvd2x  vs21, o16, T2
    lxvd2x  vs22, o32, T2
    lxvd2x  vs23, o48, T2

#endif


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs33,vs33)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs32  // realA*realB
    XSFADD_R2   vs0, vs0, vs33  // imagA*imagB

    XXSWAPD(vs32,vs32)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs33,vs33)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs32  // imagA*realB
    XSFADD_I2   vs1, vs1, vs33  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs8, vs2, vs3, 0   // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs35,vs35)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs34  // realA*realB
    XSFADD_R2   vs0, vs0, vs35  // imagA*imagB

    XXSWAPD(vs34,vs34)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs35,vs35)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs34  // imagA*realB
    XSFADD_I2   vs1, vs1, vs35  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs9, vs2, vs3, 0   // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs37,vs37)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs36  // realA*realB
    XSFADD_R2   vs0, vs0, vs37  // imagA*imagB

    XXSWAPD(vs36,vs36)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs37,vs37)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs36  // imagA*realB
    XSFADD_I2   vs1, vs1, vs37  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs10, vs2, vs3, 0  // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs39,vs39)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs38  // realA*realB
    XSFADD_R2   vs0, vs0, vs39  // imagA*imagB

    XXSWAPD(vs38,vs38)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs39,vs39)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs38  // imagA*realB
    XSFADD_I2   vs1, vs1, vs39  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs11, vs2, vs3, 0  // merge real and imag part

    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs41,vs41)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs40  // realA*realB
    XSFADD_R2   vs0, vs0, vs41  // imagA*imagB

    XXSWAPD(vs40,vs40)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs41,vs41)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs40  // imagA*realB
    XSFADD_I2   vs1, vs1, vs41  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs12, vs2, vs3, 0  // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs43,vs43)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs42  // realA*realB
    XSFADD_R2   vs0, vs0, vs43  // imagA*imagB

    XXSWAPD(vs42,vs42)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs43,vs43)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs42  // imagA*realB
    XSFADD_I2   vs1, vs1, vs43  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs13, vs2, vs3, 0  // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs45,vs45)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs44  // realA*realB
    XSFADD_R2   vs0, vs0, vs45  // imagA*imagB

    XXSWAPD(vs44,vs44)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs45,vs45)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs44  // imagA*realB
    XSFADD_I2   vs1, vs1, vs45  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs14, vs2, vs3, 0  // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs47,vs47)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs46  // realA*realB
    XSFADD_R2   vs0, vs0, vs47  // imagA*imagB

    XXSWAPD(vs46,vs46)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs47,vs47)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs46  // imagA*realB
    XSFADD_I2   vs1, vs1, vs47  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs15, vs2, vs3, 0  // merge real and imag part


#ifndef TRMMKERNEL

    xvadddp vs8, vs8, vs16
    xvadddp vs9, vs9, vs17
    xvadddp vs10, vs10, vs18
    xvadddp vs11, vs11, vs19
    xvadddp vs12, vs12, vs20
    xvadddp vs13, vs13, vs21
    xvadddp vs14, vs14, vs22
    xvadddp vs15, vs15, vs23

#endif

    stxvd2x vs8, o0, T1
    stxvd2x vs9, o16, T1
    stxvd2x vs10, o32, T1
    stxvd2x vs11, o48, T1
    stxvd2x vs12, o0, T2
    stxvd2x vs13, o16, T2
    stxvd2x vs14, o32, T2
    stxvd2x vs15, o48, T2

    add     T1, T1, LDC
    add     T2, T2, LDC

#ifndef TRMMKERNEL

    lxvd2x  vs16, o0, T1
    lxvd2x  vs17, o16, T1
    lxvd2x  vs18, o32, T1
    lxvd2x  vs19, o48, T1
    lxvd2x  vs20, o0, T2
    lxvd2x  vs21, o16, T2
    lxvd2x  vs22, o32, T2
    lxvd2x  vs23, o48, T2

#endif

    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs49,vs49)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs48  // realA*realB
    XSFADD_R2   vs0, vs0, vs49  // imagA*imagB

    XXSWAPD(vs48,vs48)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs49,vs49)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs48  // imagA*realB
    XSFADD_I2   vs1, vs1, vs49  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs8, vs2, vs3, 0   // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs51,vs51)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs50  // realA*realB
    XSFADD_R2   vs0, vs0, vs51  // imagA*imagB

    XXSWAPD(vs50,vs50)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs51,vs51)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs50  // imagA*realB
    XSFADD_I2   vs1, vs1, vs51  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs9, vs2, vs3, 0   // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs53,vs53)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs52  // realA*realB
    XSFADD_R2   vs0, vs0, vs53  // imagA*imagB

    XXSWAPD(vs52,vs52)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs53,vs53)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs52  // imagA*realB
    XSFADD_I2   vs1, vs1, vs53  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs10, vs2, vs3, 0  // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs55,vs55)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs54  // realA*realB
    XSFADD_R2   vs0, vs0, vs55  // imagA*imagB

    XXSWAPD(vs54,vs54)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs55,vs55)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs54  // imagA*realB
    XSFADD_I2   vs1, vs1, vs55  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs11, vs2, vs3, 0  // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs57,vs57)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs56  // realA*realB
    XSFADD_R2   vs0, vs0, vs57  // imagA*imagB

    XXSWAPD(vs56,vs56)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs57,vs57)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs56  // imagA*realB
    XSFADD_I2   vs1, vs1, vs57  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs12, vs2, vs3, 0  // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs59,vs59)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs58  // realA*realB
    XSFADD_R2   vs0, vs0, vs59  // imagA*imagB

    XXSWAPD(vs58,vs58)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs59,vs59)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs58  // imagA*realB
    XSFADD_I2   vs1, vs1, vs59  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs13, vs2, vs3, 0  // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs61,vs61)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs60  // realA*realB
    XSFADD_R2   vs0, vs0, vs61  // imagA*imagB

    XXSWAPD(vs60,vs60)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs61,vs61)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs60  // imagA*realB
    XSFADD_I2   vs1, vs1, vs61  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs14, vs2, vs3, 0  // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs63,vs63)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs62  // realA*realB
    XSFADD_R2   vs0, vs0, vs63  // imagA*imagB

    XXSWAPD(vs62,vs62)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs63,vs63)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs62  // imagA*realB
    XSFADD_I2   vs1, vs1, vs63  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs15, vs2, vs3, 0  // merge real and imag part


#ifndef TRMMKERNEL

    xvadddp vs8, vs8, vs16
    xvadddp vs9, vs9, vs17
    xvadddp vs10, vs10, vs18
    xvadddp vs11, vs11, vs19
    xvadddp vs12, vs12, vs20
    xvadddp vs13, vs13, vs21
    xvadddp vs14, vs14, vs22
    xvadddp vs15, vs15, vs23

#endif

    stxvd2x vs8, o0, T1
    stxvd2x vs9, o16, T1
    stxvd2x vs10, o32, T1
    stxvd2x vs11, o48, T1
    stxvd2x vs12, o0, T2
    stxvd2x vs13, o16, T2
    stxvd2x vs14, o32, T2
    stxvd2x vs15, o48, T2

    add     T1, T1, LDC
    add     T2, T2, LDC
    addi    CO, CO, 128

#if defined(_AIX)
')
#else
.endm
#endif
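
/**********************************************************************************************
* The remaining sections repeat the same load / multiply / save pattern for narrower
* row tiles.  A hypothetical driver (not part of this file) would peel M into these
* widths roughly as in the following runnable C sketch; the tile functions are
* stand-ins for invocations of the 8/4/2/1 macro families:
*
*   #include <stdio.h>
*   static void tile8(void) { puts("2x8 tile"); }
*   static void tile4(void) { puts("2x4 tile"); }
*   static void tile2(void) { puts("2x2 tile"); }
*   static void tile1(void) { puts("2x1 tile"); }
*   int main(void) {
*       long m = 15, i;
*       for (i = 0; i + 8 <= m; i += 8) tile8();  // full-width tiles
*       if (m & 4) tile4();                       // then progressively
*       if (m & 2) tile2();                       // narrower remainders
*       if (m & 1) tile1();
*       return 0;
*   }
**********************************************************************************************/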

/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/

#if defined(_AIX)
define(`LOAD2x4_1', `
#else
.macro LOAD2x4_1
#endif

    lxvd2x  vs16, o0, BO      // load real part from B
    lxvd2x  vs17, o16, BO     // load imag part from B
    lxvd2x  vs18, o32, BO     // load real part from B
    lxvd2x  vs19, o48, BO     // load imag part from B

    addi    BO, BO, 64

    lxvd2x  vs0, o0, AO       // load real,imag from A
    lxvd2x  vs1, o16, AO      // load real,imag from A
    lxvd2x  vs2, o32, AO      // load real,imag from A
    lxvd2x  vs3, o48, AO      // load real,imag from A

    addi    AO, AO, 64


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x4_I1', `
#else
.macro KERNEL2x4_I1
#endif

    lxvd2x  vs8, o0, AO       // load real,imag from A
    lxvd2x  vs9, o16, AO      // load real,imag from A
    lxvd2x  vs10, o32, AO     // load real,imag from A
    lxvd2x  vs11, o48, AO     // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs20, o0, BO      // load real part from B
    lxvd2x  vs21, o16, BO     // load imag part from B
    lxvd2x  vs22, o32, BO     // load real part from B
    lxvd2x  vs23, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmuldp vs32, vs0, vs16   // real*real, imag*real
    xvmuldp vs33, vs0, vs17   // real*imag, imag*imag
    xvmuldp vs34, vs1, vs16   // real*real, imag*real
    xvmuldp vs35, vs1, vs17   // real*imag, imag*imag
    xvmuldp vs36, vs2, vs16   // real*real, imag*real
    xvmuldp vs37, vs2, vs17   // real*imag, imag*imag
    xvmuldp vs38, vs3, vs16   // real*real, imag*real
    xvmuldp vs39, vs3, vs17   // real*imag, imag*imag

    xvmuldp vs40, vs0, vs18   // real*real, imag*real
    xvmuldp vs41, vs0, vs19   // real*imag, imag*imag
    xvmuldp vs42, vs1, vs18   // real*real, imag*real
    xvmuldp vs43, vs1, vs19   // real*imag, imag*imag
    xvmuldp vs44, vs2, vs18   // real*real, imag*real
    xvmuldp vs45, vs2, vs19   // real*imag, imag*imag
    xvmuldp vs46, vs3, vs18   // real*real, imag*real
    xvmuldp vs47, vs3, vs19   // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x4_1', `
#else
.macro KERNEL2x4_1
#endif

    lxvd2x  vs8, o0, AO       // load real,imag from A
    lxvd2x  vs9, o16, AO      // load real,imag from A
    lxvd2x  vs10, o32, AO     // load real,imag from A
    lxvd2x  vs11, o48, AO     // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs20, o0, BO      // load real part from B
    lxvd2x  vs21, o16, BO     // load imag part from B
    lxvd2x  vs22, o32, BO     // load real part from B
    lxvd2x  vs23, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
    xvmaddadp vs34, vs1, vs16 // real*real, imag*real
    xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
    xvmaddadp vs36, vs2, vs16 // real*real, imag*real
    xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
    xvmaddadp vs38, vs3, vs16 // real*real, imag*real
    xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag

    xvmaddadp vs40, vs0, vs18 // real*real, imag*real
    xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
    xvmaddadp vs42, vs1, vs18 // real*real, imag*real
    xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
    xvmaddadp vs44, vs2, vs18 // real*real, imag*real
    xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
    xvmaddadp vs46, vs3, vs18 // real*real, imag*real
    xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x4_2', `
#else
.macro KERNEL2x4_2
#endif

    lxvd2x  vs0, o0, AO       // load real,imag from A
    lxvd2x  vs1, o16, AO      // load real,imag from A
    lxvd2x  vs2, o32, AO      // load real,imag from A
    lxvd2x  vs3, o48, AO      // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs16, o0, BO      // load real part from B
    lxvd2x  vs17, o16, BO     // load imag part from B
    lxvd2x  vs18, o32, BO     // load real part from B
    lxvd2x  vs19, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmaddadp vs32, vs8, vs20  // real*real, imag*real
    xvmaddadp vs33, vs8, vs21  // real*imag, imag*imag
    xvmaddadp vs34, vs9, vs20  // real*real, imag*real
    xvmaddadp vs35, vs9, vs21  // real*imag, imag*imag
    xvmaddadp vs36, vs10, vs20 // real*real, imag*real
    xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
    xvmaddadp vs38, vs11, vs20 // real*real, imag*real
    xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag

    xvmaddadp vs40, vs8, vs22  // real*real, imag*real
    xvmaddadp vs41, vs8, vs23  // real*imag, imag*imag
    xvmaddadp vs42, vs9, vs22  // real*real, imag*real
    xvmaddadp vs43, vs9, vs23  // real*imag, imag*imag
    xvmaddadp vs44, vs10, vs22 // real*real, imag*real
    xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
    xvmaddadp vs46, vs11, vs22 // real*real, imag*real
    xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x4_E2', `
#else
.macro KERNEL2x4_E2
#endif


    xvmaddadp vs32, vs8, vs20  // real*real, imag*real
    xvmaddadp vs33, vs8, vs21  // real*imag, imag*imag
    xvmaddadp vs34, vs9, vs20  // real*real, imag*real
    xvmaddadp vs35, vs9, vs21  // real*imag, imag*imag
    xvmaddadp vs36, vs10, vs20 // real*real, imag*real
    xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
    xvmaddadp vs38, vs11, vs20 // real*real, imag*real
    xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag

    xvmaddadp vs40, vs8, vs22  // real*real, imag*real
    xvmaddadp vs41, vs8, vs23  // real*imag, imag*imag
    xvmaddadp vs42, vs9, vs22  // real*real, imag*real
    xvmaddadp vs43, vs9, vs23  // real*imag, imag*imag
    xvmaddadp vs44, vs10, vs22 // real*real, imag*real
    xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
    xvmaddadp vs46, vs11, vs22 // real*real, imag*real
    xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x4_SUBI1', `
#else
.macro KERNEL2x4_SUBI1
#endif

    lxvd2x  vs0, o0, AO       // load real,imag from A
    lxvd2x  vs1, o16, AO      // load real,imag from A
    lxvd2x  vs2, o32, AO      // load real,imag from A
    lxvd2x  vs3, o48, AO      // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs16, o0, BO      // load real part from B
    lxvd2x  vs17, o16, BO     // load imag part from B
    lxvd2x  vs18, o32, BO     // load real part from B
    lxvd2x  vs19, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmuldp vs32, vs0, vs16   // real*real, imag*real
    xvmuldp vs33, vs0, vs17   // real*imag, imag*imag
    xvmuldp vs34, vs1, vs16   // real*real, imag*real
    xvmuldp vs35, vs1, vs17   // real*imag, imag*imag
    xvmuldp vs36, vs2, vs16   // real*real, imag*real
    xvmuldp vs37, vs2, vs17   // real*imag, imag*imag
    xvmuldp vs38, vs3, vs16   // real*real, imag*real
    xvmuldp vs39, vs3, vs17   // real*imag, imag*imag

    xvmuldp vs40, vs0, vs18   // real*real, imag*real
    xvmuldp vs41, vs0, vs19   // real*imag, imag*imag
    xvmuldp vs42, vs1, vs18   // real*real, imag*real
    xvmuldp vs43, vs1, vs19   // real*imag, imag*imag
    xvmuldp vs44, vs2, vs18   // real*real, imag*real
    xvmuldp vs45, vs2, vs19   // real*imag, imag*imag
    xvmuldp vs46, vs3, vs18   // real*real, imag*real
    xvmuldp vs47, vs3, vs19   // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x4_SUB1', `
#else
.macro KERNEL2x4_SUB1
#endif

    lxvd2x  vs0, o0, AO       // load real,imag from A
    lxvd2x  vs1, o16, AO      // load real,imag from A
    lxvd2x  vs2, o32, AO      // load real,imag from A
    lxvd2x  vs3, o48, AO      // load real,imag from A

    addi    AO, AO, 64

    lxvd2x  vs16, o0, BO      // load real part from B
    lxvd2x  vs17, o16, BO     // load imag part from B
    lxvd2x  vs18, o32, BO     // load real part from B
    lxvd2x  vs19, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
    xvmaddadp vs34, vs1, vs16 // real*real, imag*real
    xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
    xvmaddadp vs36, vs2, vs16 // real*real, imag*real
    xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
    xvmaddadp vs38, vs3, vs16 // real*real, imag*real
    xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag

    xvmaddadp vs40, vs0, vs18 // real*real, imag*real
    xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
    xvmaddadp vs42, vs1, vs18 // real*real, imag*real
    xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
    xvmaddadp vs44, vs2, vs18 // real*real, imag*real
    xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
    xvmaddadp vs46, vs3, vs18 // real*real, imag*real
    xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE2x4', `
#else
.macro SAVE2x4
#endif


    mr      T1, CO

#ifndef TRMMKERNEL

    lxvd2x  vs16, o0, T1
    lxvd2x  vs17, o16, T1
    lxvd2x  vs18, o32, T1
    lxvd2x  vs19, o48, T1

#endif


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs33,vs33)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs32  // realA*realB
    XSFADD_R2   vs0, vs0, vs33  // imagA*imagB

    XXSWAPD(vs32,vs32)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs33,vs33)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs32  // imagA*realB
    XSFADD_I2   vs1, vs1, vs33  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs8, vs2, vs3, 0   // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs35,vs35)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs34  // realA*realB
    XSFADD_R2   vs0, vs0, vs35  // imagA*imagB

    XXSWAPD(vs34,vs34)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs35,vs35)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs34  // imagA*realB
    XSFADD_I2   vs1, vs1, vs35  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs9, vs2, vs3, 0   // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs37,vs37)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs36  // realA*realB
    XSFADD_R2   vs0, vs0, vs37  // imagA*imagB

    XXSWAPD(vs36,vs36)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs37,vs37)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs36  // imagA*realB
    XSFADD_I2   vs1, vs1, vs37  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs10, vs2, vs3, 0  // merge real and imag part

    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs39,vs39)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs38  // realA*realB
    XSFADD_R2   vs0, vs0, vs39  // imagA*imagB

    XXSWAPD(vs38,vs38)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs39,vs39)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs38  // imagA*realB
    XSFADD_I2   vs1, vs1, vs39  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs11, vs2, vs3, 0  // merge real and imag part


#ifndef TRMMKERNEL

    xvadddp vs8, vs8, vs16
    xvadddp vs9, vs9, vs17
    xvadddp vs10, vs10, vs18
    xvadddp vs11, vs11, vs19

#endif

    stxvd2x vs8, o0, T1
    stxvd2x vs9, o16, T1
    stxvd2x vs10, o32, T1
    stxvd2x vs11, o48, T1

    add     T1, T1, LDC

#ifndef TRMMKERNEL

    lxvd2x  vs16, o0, T1
    lxvd2x  vs17, o16, T1
    lxvd2x  vs18, o32, T1
    lxvd2x  vs19, o48, T1

#endif


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs41,vs41)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs40  // realA*realB
    XSFADD_R2   vs0, vs0, vs41  // imagA*imagB

    XXSWAPD(vs40,vs40)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs41,vs41)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs40  // imagA*realB
    XSFADD_I2   vs1, vs1, vs41  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs8, vs2, vs3, 0   // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs43,vs43)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs42  // realA*realB
    XSFADD_R2   vs0, vs0, vs43  // imagA*imagB

    XXSWAPD(vs42,vs42)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs43,vs43)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs42  // imagA*realB
    XSFADD_I2   vs1, vs1, vs43  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs9, vs2, vs3, 0   // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs45,vs45)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs44  // realA*realB
    XSFADD_R2   vs0, vs0, vs45  // imagA*imagB

    XXSWAPD(vs44,vs44)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs45,vs45)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs44  // imagA*realB
    XSFADD_I2   vs1, vs1, vs45  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs10, vs2, vs3, 0  // merge real and imag part

    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs47,vs47)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs46  // realA*realB
    XSFADD_R2   vs0, vs0, vs47  // imagA*imagB

    XXSWAPD(vs46,vs46)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs47,vs47)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs46  // imagA*realB
    XSFADD_I2   vs1, vs1, vs47  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs11, vs2, vs3, 0  // merge real and imag part


#ifndef TRMMKERNEL

    xvadddp vs8, vs8, vs16
    xvadddp vs9, vs9, vs17
    xvadddp vs10, vs10, vs18
    xvadddp vs11, vs11, vs19

#endif

    stxvd2x vs8, o0, T1
    stxvd2x vs9, o16, T1
    stxvd2x vs10, o32, T1
    stxvd2x vs11, o48, T1

    add     T1, T1, LDC
    addi    CO, CO, 64

#if defined(_AIX)
')
#else
.endm
#endif


/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/

#if defined(_AIX)
define(`LOAD2x2_1', `
#else
.macro LOAD2x2_1
#endif

    lxvd2x  vs16, o0, BO      // load real part from B
    lxvd2x  vs17, o16, BO     // load imag part from B
    lxvd2x  vs18, o32, BO     // load real part from B
    lxvd2x  vs19, o48, BO     // load imag part from B

    addi    BO, BO, 64

    lxvd2x  vs0, o0, AO       // load real,imag from A
    lxvd2x  vs1, o16, AO      // load real,imag from A

    addi    AO, AO, 32


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x2_I1', `
#else
.macro KERNEL2x2_I1
#endif

    lxvd2x  vs8, o0, AO       // load real,imag from A
    lxvd2x  vs9, o16, AO      // load real,imag from A

    addi    AO, AO, 32

    lxvd2x  vs20, o0, BO      // load real part from B
    lxvd2x  vs21, o16, BO     // load imag part from B
    lxvd2x  vs22, o32, BO     // load real part from B
    lxvd2x  vs23, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmuldp vs32, vs0, vs16   // real*real, imag*real
    xvmuldp vs33, vs0, vs17   // real*imag, imag*imag
    xvmuldp vs34, vs1, vs16   // real*real, imag*real
    xvmuldp vs35, vs1, vs17   // real*imag, imag*imag

    xvmuldp vs36, vs0, vs18   // real*real, imag*real
    xvmuldp vs37, vs0, vs19   // real*imag, imag*imag
    xvmuldp vs38, vs1, vs18   // real*real, imag*real
    xvmuldp vs39, vs1, vs19   // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x2_1', `
#else
.macro KERNEL2x2_1
#endif

    lxvd2x  vs8, o0, AO       // load real,imag from A
    lxvd2x  vs9, o16, AO      // load real,imag from A

    addi    AO, AO, 32

    lxvd2x  vs20, o0, BO      // load real part from B
    lxvd2x  vs21, o16, BO     // load imag part from B
    lxvd2x  vs22, o32, BO     // load real part from B
    lxvd2x  vs23, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
    xvmaddadp vs34, vs1, vs16 // real*real, imag*real
    xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag

    xvmaddadp vs36, vs0, vs18 // real*real, imag*real
    xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
    xvmaddadp vs38, vs1, vs18 // real*real, imag*real
    xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x2_2', `
#else
.macro KERNEL2x2_2
#endif

    lxvd2x  vs0, o0, AO       // load real,imag from A
    lxvd2x  vs1, o16, AO      // load real,imag from A

    addi    AO, AO, 32

    lxvd2x  vs16, o0, BO      // load real part from B
    lxvd2x  vs17, o16, BO     // load imag part from B
    lxvd2x  vs18, o32, BO     // load real part from B
    lxvd2x  vs19, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmaddadp vs32, vs8, vs20 // real*real, imag*real
    xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
    xvmaddadp vs34, vs9, vs20 // real*real, imag*real
    xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag

    xvmaddadp vs36, vs8, vs22 // real*real, imag*real
    xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
    xvmaddadp vs38, vs9, vs22 // real*real, imag*real
    xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x2_E2', `
#else
.macro KERNEL2x2_E2
#endif


    xvmaddadp vs32, vs8, vs20 // real*real, imag*real
    xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
    xvmaddadp vs34, vs9, vs20 // real*real, imag*real
    xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag

    xvmaddadp vs36, vs8, vs22 // real*real, imag*real
    xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
    xvmaddadp vs38, vs9, vs22 // real*real, imag*real
    xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x2_SUBI1', `
#else
.macro KERNEL2x2_SUBI1
#endif

    lxvd2x  vs0, o0, AO       // load real,imag from A
    lxvd2x  vs1, o16, AO      // load real,imag from A

    addi    AO, AO, 32

    lxvd2x  vs16, o0, BO      // load real part from B
    lxvd2x  vs17, o16, BO     // load imag part from B
    lxvd2x  vs18, o32, BO     // load real part from B
    lxvd2x  vs19, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmuldp vs32, vs0, vs16   // real*real, imag*real
    xvmuldp vs33, vs0, vs17   // real*imag, imag*imag
    xvmuldp vs34, vs1, vs16   // real*real, imag*real
    xvmuldp vs35, vs1, vs17   // real*imag, imag*imag

    xvmuldp vs36, vs0, vs18   // real*real, imag*real
    xvmuldp vs37, vs0, vs19   // real*imag, imag*imag
    xvmuldp vs38, vs1, vs18   // real*real, imag*real
    xvmuldp vs39, vs1, vs19   // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x2_SUB1', `
#else
.macro KERNEL2x2_SUB1
#endif

    lxvd2x  vs0, o0, AO       // load real,imag from A
    lxvd2x  vs1, o16, AO      // load real,imag from A

    addi    AO, AO, 32

    lxvd2x  vs16, o0, BO      // load real part from B
    lxvd2x  vs17, o16, BO     // load imag part from B
    lxvd2x  vs18, o32, BO     // load real part from B
    lxvd2x  vs19, o48, BO     // load imag part from B

    addi    BO, BO, 64

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
    xvmaddadp vs34, vs1, vs16 // real*real, imag*real
    xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag

    xvmaddadp vs36, vs0, vs18 // real*real, imag*real
    xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
    xvmaddadp vs38, vs1, vs18 // real*real, imag*real
    xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`SAVE2x2', `
#else
.macro SAVE2x2
#endif


    mr      T1, CO

#ifndef TRMMKERNEL

    lxvd2x  vs16, o0, T1
    lxvd2x  vs17, o16, T1

#endif


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs33,vs33)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs32  // realA*realB
    XSFADD_R2   vs0, vs0, vs33  // imagA*imagB

    XXSWAPD(vs32,vs32)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs33,vs33)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs32  // imagA*realB
    XSFADD_I2   vs1, vs1, vs33  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs8, vs2, vs3, 0   // merge real and imag part


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs35,vs35)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs34  // realA*realB
    XSFADD_R2   vs0, vs0, vs35  // imagA*imagB

    XXSWAPD(vs34,vs34)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs35,vs35)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs34  // imagA*realB
    XSFADD_I2   vs1, vs1, vs35  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs9, vs2, vs3, 0   // merge real and imag part


#ifndef TRMMKERNEL

    xvadddp vs8, vs8, vs16
    xvadddp vs9, vs9, vs17

#endif

    stxvd2x vs8, o0, T1
    stxvd2x vs9, o16, T1

    add     T1, T1, LDC

#ifndef TRMMKERNEL

    lxvd2x  vs16, o0, T1
    lxvd2x  vs17, o16, T1

#endif


    xxlxor  vs0, vs0, vs0
    xxlxor  vs1, vs1, vs1
    XXSWAPD(vs37,vs37)          // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1   vs0, vs0, vs36  // realA*realB
    XSFADD_R2   vs0, vs0, vs37  // imagA*imagB

    XXSWAPD(vs36,vs36)          // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs37,vs37)          // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1   vs1, vs1, vs36  // imagA*realB
    XSFADD_I2   vs1, vs1, vs37  // realA*imagB

    xsmuldp vs4, vs0, alpha_r   // real*alpha_r
    xsmuldp vs5, vs1, alpha_i   // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i   // real*alpha_i
    xsmuldp vs7, vs1, alpha_r   // imag*alpha_r

    xssubdp vs2, vs4, vs5       // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7       // real*alpha_i + imag*alpha_r
    xxpermdi vs8, vs2, vs3, 0   // merge real and imag part

#if defined(_AIX)
define(`SAVE2x2', `
#else
.macro SAVE2x2
#endif


    mr T1, CO

#ifndef TRMMKERNEL

    lxvd2x vs16, o0, T1
    lxvd2x vs17, o16, T1

#endif


    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs32 // realA*realB
    XSFADD_R2 vs0, vs0, vs33 // imagA*imagB

    XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs32 // realA*imagB
    XSFADD_I2 vs1, vs1, vs33 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs8, vs2, vs3, 0 // merge real and imag part



    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs34 // realA*realB
    XSFADD_R2 vs0, vs0, vs35 // imagA*imagB

    XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs34 // realA*imagB
    XSFADD_I2 vs1, vs1, vs35 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs9, vs2, vs3, 0 // merge real and imag part


#ifndef TRMMKERNEL

    xvadddp vs8, vs8, vs16
    xvadddp vs9, vs9, vs17

#endif

    stxvd2x vs8, o0, T1
    stxvd2x vs9, o16, T1

    add T1, T1, LDC

#ifndef TRMMKERNEL

    lxvd2x vs16, o0, T1
    lxvd2x vs17, o16, T1

#endif


    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs36 // realA*realB
    XSFADD_R2 vs0, vs0, vs37 // imagA*imagB

    XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs36 // realA*imagB
    XSFADD_I2 vs1, vs1, vs37 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs8, vs2, vs3, 0 // merge real and imag part



    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs38 // realA*realB
    XSFADD_R2 vs0, vs0, vs39 // imagA*imagB

    XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs38 // realA*imagB
    XSFADD_I2 vs1, vs1, vs39 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs9, vs2, vs3, 0 // merge real and imag part


#ifndef TRMMKERNEL

    xvadddp vs8, vs8, vs16
    xvadddp vs9, vs9, vs17

#endif

    stxvd2x vs8, o0, T1
    stxvd2x vs9, o16, T1

    add T1, T1, LDC
    addi CO, CO, 32

#if defined(_AIX)
')
#else
.endm
#endif


/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/

#if defined(_AIX)
define(`LOAD2x1_1', `
#else
.macro LOAD2x1_1
#endif

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B
    lxvd2x vs18, o32, BO // load real part from B
    lxvd2x vs19, o48, BO // load imag part from B

    addi BO, BO, 64

    lxvd2x vs0, o0, AO // load real,imag from A

    addi AO, AO, 16


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x1_I1', `
#else
.macro KERNEL2x1_I1
#endif

    lxvd2x vs8, o0, AO // load real,imag from A

    addi AO, AO, 16

    lxvd2x vs20, o0, BO // load real part from B
    lxvd2x vs21, o16, BO // load imag part from B
    lxvd2x vs22, o32, BO // load real part from B
    lxvd2x vs23, o48, BO // load imag part from B

    addi BO, BO, 64

    xvmuldp vs32, vs0, vs16 // real*real, imag*real
    xvmuldp vs33, vs0, vs17 // real*imag, imag*imag

    xvmuldp vs34, vs0, vs18 // real*real, imag*real
    xvmuldp vs35, vs0, vs19 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x1_1', `
#else
.macro KERNEL2x1_1
#endif

    lxvd2x vs8, o0, AO // load real,imag from A

    addi AO, AO, 16

    lxvd2x vs20, o0, BO // load real part from B
    lxvd2x vs21, o16, BO // load imag part from B
    lxvd2x vs22, o32, BO // load real part from B
    lxvd2x vs23, o48, BO // load imag part from B

    addi BO, BO, 64

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag

    xvmaddadp vs34, vs0, vs18 // real*real, imag*real
    xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x1_2', `
#else
.macro KERNEL2x1_2
#endif

    lxvd2x vs0, o0, AO // load real,imag from A

    addi AO, AO, 16

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B
    lxvd2x vs18, o32, BO // load real part from B
    lxvd2x vs19, o48, BO // load imag part from B

    addi BO, BO, 64

    xvmaddadp vs32, vs8, vs20 // real*real, imag*real
    xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag

    xvmaddadp vs34, vs8, vs22 // real*real, imag*real
    xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x1_E2', `
#else
.macro KERNEL2x1_E2
#endif


    xvmaddadp vs32, vs8, vs20 // real*real, imag*real
    xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag

    xvmaddadp vs34, vs8, vs22 // real*real, imag*real
    xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x1_SUBI1', `
#else
.macro KERNEL2x1_SUBI1
#endif

    lxvd2x vs0, o0, AO // load real,imag from A

    addi AO, AO, 16

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B
    lxvd2x vs18, o32, BO // load real part from B
    lxvd2x vs19, o48, BO // load imag part from B

    addi BO, BO, 64

    xvmuldp vs32, vs0, vs16 // real*real, imag*real
    xvmuldp vs33, vs0, vs17 // real*imag, imag*imag

    xvmuldp vs34, vs0, vs18 // real*real, imag*real
    xvmuldp vs35, vs0, vs19 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x1_SUB1', `
#else
.macro KERNEL2x1_SUB1
#endif

    lxvd2x vs0, o0, AO // load real,imag from A

    addi AO, AO, 16

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B
    lxvd2x vs18, o32, BO // load real part from B
    lxvd2x vs19, o48, BO // load imag part from B

    addi BO, BO, 64

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag

    xvmaddadp vs34, vs0, vs18 // real*real, imag*real
    xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`SAVE2x1', `
#else
.macro SAVE2x1
#endif


    mr T1, CO

#ifndef TRMMKERNEL

    lxvd2x vs16, o0, T1

#endif


    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs32 // realA*realB
    XSFADD_R2 vs0, vs0, vs33 // imagA*imagB

    XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs32 // realA*imagB
    XSFADD_I2 vs1, vs1, vs33 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs8, vs2, vs3, 0 // merge real and imag part


#ifndef TRMMKERNEL

    xvadddp vs8, vs8, vs16

#endif

    stxvd2x vs8, o0, T1

    add T1, T1, LDC

#ifndef TRMMKERNEL

    lxvd2x vs16, o0, T1

#endif


    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs34 // realA*realB
    XSFADD_R2 vs0, vs0, vs35 // imagA*imagB

    XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs34 // realA*imagB
    XSFADD_I2 vs1, vs1, vs35 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs8, vs2, vs3, 0 // merge real and imag part


#ifndef TRMMKERNEL

    xvadddp vs8, vs8, vs16

#endif

    stxvd2x vs8, o0, T1

    add T1, T1, LDC
    addi CO, CO, 16

#if defined(_AIX)
')
#else
.endm
#endif


/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
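
/**********************************************************************************************
* Note (added comment): the kernels in this group are software-pipelined in pairs. LOAD1x8_1
* preloads A into vs0..vs7 and B into vs16/vs17; KERNEL1x8_I1 and KERNEL1x8_1 multiply those
* while fetching the next iteration into vs8..vs15 and vs20/vs21, and KERNEL1x8_2 does the
* reverse, so loads and FMAs from consecutive iterations overlap. KERNEL1x8_E2 drains the
* last preloaded set without issuing further loads.
**********************************************************************************************/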

#if defined(_AIX)
define(`LOAD1x8_1', `
#else
.macro LOAD1x8_1
#endif

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    lxvd2x vs0, o0, AO // load real,imag from A
    lxvd2x vs1, o16, AO // load real,imag from A
    lxvd2x vs2, o32, AO // load real,imag from A
    lxvd2x vs3, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs4, o0, AO // load real,imag from A
    lxvd2x vs5, o16, AO // load real,imag from A
    lxvd2x vs6, o32, AO // load real,imag from A
    lxvd2x vs7, o48, AO // load real,imag from A

    addi AO, AO, 64


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x8_I1', `
#else
.macro KERNEL1x8_I1
#endif

    lxvd2x vs8, o0, AO // load real,imag from A
    lxvd2x vs9, o16, AO // load real,imag from A
    lxvd2x vs10, o32, AO // load real,imag from A
    lxvd2x vs11, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs12, o0, AO // load real,imag from A
    lxvd2x vs13, o16, AO // load real,imag from A
    lxvd2x vs14, o32, AO // load real,imag from A
    lxvd2x vs15, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs20, o0, BO // load real part from B
    lxvd2x vs21, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmuldp vs32, vs0, vs16 // real*real, imag*real
    xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
    xvmuldp vs34, vs1, vs16 // real*real, imag*real
    xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
    xvmuldp vs36, vs2, vs16 // real*real, imag*real
    xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
    xvmuldp vs38, vs3, vs16 // real*real, imag*real
    xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
    xvmuldp vs40, vs4, vs16 // real*real, imag*real
    xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
    xvmuldp vs42, vs5, vs16 // real*real, imag*real
    xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
    xvmuldp vs44, vs6, vs16 // real*real, imag*real
    xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
    xvmuldp vs46, vs7, vs16 // real*real, imag*real
    xvmuldp vs47, vs7, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x8_1', `
#else
.macro KERNEL1x8_1
#endif

    lxvd2x vs8, o0, AO // load real,imag from A
    lxvd2x vs9, o16, AO // load real,imag from A
    lxvd2x vs10, o32, AO // load real,imag from A
    lxvd2x vs11, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs12, o0, AO // load real,imag from A
    lxvd2x vs13, o16, AO // load real,imag from A
    lxvd2x vs14, o32, AO // load real,imag from A
    lxvd2x vs15, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs20, o0, BO // load real part from B
    lxvd2x vs21, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
    xvmaddadp vs34, vs1, vs16 // real*real, imag*real
    xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
    xvmaddadp vs36, vs2, vs16 // real*real, imag*real
    xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
    xvmaddadp vs38, vs3, vs16 // real*real, imag*real
    xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
    xvmaddadp vs40, vs4, vs16 // real*real, imag*real
    xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
    xvmaddadp vs42, vs5, vs16 // real*real, imag*real
    xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
    xvmaddadp vs44, vs6, vs16 // real*real, imag*real
    xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
    xvmaddadp vs46, vs7, vs16 // real*real, imag*real
    xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x8_2', `
#else
.macro KERNEL1x8_2
#endif

    lxvd2x vs0, o0, AO // load real,imag from A
    lxvd2x vs1, o16, AO // load real,imag from A
    lxvd2x vs2, o32, AO // load real,imag from A
    lxvd2x vs3, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs4, o0, AO // load real,imag from A
    lxvd2x vs5, o16, AO // load real,imag from A
    lxvd2x vs6, o32, AO // load real,imag from A
    lxvd2x vs7, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmaddadp vs32, vs8, vs20 // real*real, imag*real
    xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
    xvmaddadp vs34, vs9, vs20 // real*real, imag*real
    xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
    xvmaddadp vs36, vs10, vs20 // real*real, imag*real
    xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
    xvmaddadp vs38, vs11, vs20 // real*real, imag*real
    xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
    xvmaddadp vs40, vs12, vs20 // real*real, imag*real
    xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
    xvmaddadp vs42, vs13, vs20 // real*real, imag*real
    xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
    xvmaddadp vs44, vs14, vs20 // real*real, imag*real
    xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
    xvmaddadp vs46, vs15, vs20 // real*real, imag*real
    xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x8_E2', `
#else
.macro KERNEL1x8_E2
#endif


    xvmaddadp vs32, vs8, vs20 // real*real, imag*real
    xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
    xvmaddadp vs34, vs9, vs20 // real*real, imag*real
    xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
    xvmaddadp vs36, vs10, vs20 // real*real, imag*real
    xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
    xvmaddadp vs38, vs11, vs20 // real*real, imag*real
    xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
    xvmaddadp vs40, vs12, vs20 // real*real, imag*real
    xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
    xvmaddadp vs42, vs13, vs20 // real*real, imag*real
    xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
    xvmaddadp vs44, vs14, vs20 // real*real, imag*real
    xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
    xvmaddadp vs46, vs15, vs20 // real*real, imag*real
    xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x8_SUBI1', `
#else
.macro KERNEL1x8_SUBI1
#endif

    lxvd2x vs0, o0, AO // load real,imag from A
    lxvd2x vs1, o16, AO // load real,imag from A
    lxvd2x vs2, o32, AO // load real,imag from A
    lxvd2x vs3, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs4, o0, AO // load real,imag from A
    lxvd2x vs5, o16, AO // load real,imag from A
    lxvd2x vs6, o32, AO // load real,imag from A
    lxvd2x vs7, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmuldp vs32, vs0, vs16 // real*real, imag*real
    xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
    xvmuldp vs34, vs1, vs16 // real*real, imag*real
    xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
    xvmuldp vs36, vs2, vs16 // real*real, imag*real
    xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
    xvmuldp vs38, vs3, vs16 // real*real, imag*real
    xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
    xvmuldp vs40, vs4, vs16 // real*real, imag*real
    xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
    xvmuldp vs42, vs5, vs16 // real*real, imag*real
    xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
    xvmuldp vs44, vs6, vs16 // real*real, imag*real
    xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
    xvmuldp vs46, vs7, vs16 // real*real, imag*real
    xvmuldp vs47, vs7, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x8_SUB1', `
#else
.macro KERNEL1x8_SUB1
#endif

    lxvd2x vs0, o0, AO // load real,imag from A
    lxvd2x vs1, o16, AO // load real,imag from A
    lxvd2x vs2, o32, AO // load real,imag from A
    lxvd2x vs3, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs4, o0, AO // load real,imag from A
    lxvd2x vs5, o16, AO // load real,imag from A
    lxvd2x vs6, o32, AO // load real,imag from A
    lxvd2x vs7, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
    xvmaddadp vs34, vs1, vs16 // real*real, imag*real
    xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
    xvmaddadp vs36, vs2, vs16 // real*real, imag*real
    xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
    xvmaddadp vs38, vs3, vs16 // real*real, imag*real
    xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
    xvmaddadp vs40, vs4, vs16 // real*real, imag*real
    xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
    xvmaddadp vs42, vs5, vs16 // real*real, imag*real
    xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
    xvmaddadp vs44, vs6, vs16 // real*real, imag*real
    xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
    xvmaddadp vs46, vs7, vs16 // real*real, imag*real
    xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
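
/**********************************************************************************************
* Note (added comment): one row of eight complex doubles is 128 bytes, more than the
* 0..48 byte span covered by the oN index registers, so SAVE1x8 below uses two row
* pointers: T1 = CO for the first four elements and T2 = CO + 64 for the last four.
**********************************************************************************************/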

#if defined(_AIX)
define(`SAVE1x8', `
#else
.macro SAVE1x8
#endif


    mr T1, CO
    addi T2, T1, 64

#ifndef TRMMKERNEL

    lxvd2x vs16, o0, T1
    lxvd2x vs17, o16, T1
    lxvd2x vs18, o32, T1
    lxvd2x vs19, o48, T1
    lxvd2x vs20, o0, T2
    lxvd2x vs21, o16, T2
    lxvd2x vs22, o32, T2
    lxvd2x vs23, o48, T2

#endif


    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs32 // realA*realB
    XSFADD_R2 vs0, vs0, vs33 // imagA*imagB

    XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs32 // realA*imagB
    XSFADD_I2 vs1, vs1, vs33 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs8, vs2, vs3, 0 // merge real and imag part



    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs34 // realA*realB
    XSFADD_R2 vs0, vs0, vs35 // imagA*imagB

    XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs34 // realA*imagB
    XSFADD_I2 vs1, vs1, vs35 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs9, vs2, vs3, 0 // merge real and imag part



    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs36 // realA*realB
    XSFADD_R2 vs0, vs0, vs37 // imagA*imagB

    XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs36 // realA*imagB
    XSFADD_I2 vs1, vs1, vs37 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs10, vs2, vs3, 0 // merge real and imag part



    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs38 // realA*realB
    XSFADD_R2 vs0, vs0, vs39 // imagA*imagB

    XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs38 // realA*imagB
    XSFADD_I2 vs1, vs1, vs39 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs11, vs2, vs3, 0 // merge real and imag part



    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs40 // realA*realB
    XSFADD_R2 vs0, vs0, vs41 // imagA*imagB

    XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs40 // realA*imagB
    XSFADD_I2 vs1, vs1, vs41 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs12, vs2, vs3, 0 // merge real and imag part


    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs42 // realA*realB
    XSFADD_R2 vs0, vs0, vs43 // imagA*imagB

    XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs42 // realA*imagB
    XSFADD_I2 vs1, vs1, vs43 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs13, vs2, vs3, 0 // merge real and imag part



    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs44 // realA*realB
    XSFADD_R2 vs0, vs0, vs45 // imagA*imagB

    XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs44 // realA*imagB
    XSFADD_I2 vs1, vs1, vs45 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs14, vs2, vs3, 0 // merge real and imag part



    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs46 // realA*realB
    XSFADD_R2 vs0, vs0, vs47 // imagA*imagB

    XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs46 // realA*imagB
    XSFADD_I2 vs1, vs1, vs47 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs15, vs2, vs3, 0 // merge real and imag part


#ifndef TRMMKERNEL

    xvadddp vs8, vs8, vs16
    xvadddp vs9, vs9, vs17
    xvadddp vs10, vs10, vs18
    xvadddp vs11, vs11, vs19
    xvadddp vs12, vs12, vs20
    xvadddp vs13, vs13, vs21
    xvadddp vs14, vs14, vs22
    xvadddp vs15, vs15, vs23

#endif

    stxvd2x vs8, o0, T1
    stxvd2x vs9, o16, T1
    stxvd2x vs10, o32, T1
    stxvd2x vs11, o48, T1
    stxvd2x vs12, o0, T2
    stxvd2x vs13, o16, T2
    stxvd2x vs14, o32, T2
    stxvd2x vs15, o48, T2

    add T1, T1, LDC
    add T2, T2, LDC
    addi CO, CO, 128

#if defined(_AIX)
')
#else
.endm
#endif


/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/

#if defined(_AIX)
define(`LOAD1x4_1', `
#else
.macro LOAD1x4_1
#endif

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    lxvd2x vs0, o0, AO // load real,imag from A
    lxvd2x vs1, o16, AO // load real,imag from A
    lxvd2x vs2, o32, AO // load real,imag from A
    lxvd2x vs3, o48, AO // load real,imag from A

    addi AO, AO, 64


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x4_I1', `
#else
.macro KERNEL1x4_I1
#endif

    lxvd2x vs8, o0, AO // load real,imag from A
    lxvd2x vs9, o16, AO // load real,imag from A
    lxvd2x vs10, o32, AO // load real,imag from A
    lxvd2x vs11, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs20, o0, BO // load real part from B
    lxvd2x vs21, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmuldp vs32, vs0, vs16 // real*real, imag*real
    xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
    xvmuldp vs34, vs1, vs16 // real*real, imag*real
    xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
    xvmuldp vs36, vs2, vs16 // real*real, imag*real
    xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
    xvmuldp vs38, vs3, vs16 // real*real, imag*real
    xvmuldp vs39, vs3, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x4_1', `
#else
.macro KERNEL1x4_1
#endif

    lxvd2x vs8, o0, AO // load real,imag from A
    lxvd2x vs9, o16, AO // load real,imag from A
    lxvd2x vs10, o32, AO // load real,imag from A
    lxvd2x vs11, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs20, o0, BO // load real part from B
    lxvd2x vs21, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
    xvmaddadp vs34, vs1, vs16 // real*real, imag*real
    xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
    xvmaddadp vs36, vs2, vs16 // real*real, imag*real
    xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
    xvmaddadp vs38, vs3, vs16 // real*real, imag*real
    xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x4_2', `
#else
.macro KERNEL1x4_2
#endif

    lxvd2x vs0, o0, AO // load real,imag from A
    lxvd2x vs1, o16, AO // load real,imag from A
    lxvd2x vs2, o32, AO // load real,imag from A
    lxvd2x vs3, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmaddadp vs32, vs8, vs20 // real*real, imag*real
    xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
    xvmaddadp vs34, vs9, vs20 // real*real, imag*real
    xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
    xvmaddadp vs36, vs10, vs20 // real*real, imag*real
    xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
    xvmaddadp vs38, vs11, vs20 // real*real, imag*real
    xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x4_E2', `
#else
.macro KERNEL1x4_E2
#endif


    xvmaddadp vs32, vs8, vs20 // real*real, imag*real
    xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
    xvmaddadp vs34, vs9, vs20 // real*real, imag*real
    xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
    xvmaddadp vs36, vs10, vs20 // real*real, imag*real
    xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
    xvmaddadp vs38, vs11, vs20 // real*real, imag*real
    xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x4_SUBI1', `
#else
.macro KERNEL1x4_SUBI1
#endif

    lxvd2x vs0, o0, AO // load real,imag from A
    lxvd2x vs1, o16, AO // load real,imag from A
    lxvd2x vs2, o32, AO // load real,imag from A
    lxvd2x vs3, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmuldp vs32, vs0, vs16 // real*real, imag*real
    xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
    xvmuldp vs34, vs1, vs16 // real*real, imag*real
    xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
    xvmuldp vs36, vs2, vs16 // real*real, imag*real
    xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
    xvmuldp vs38, vs3, vs16 // real*real, imag*real
    xvmuldp vs39, vs3, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x4_SUB1', `
#else
.macro KERNEL1x4_SUB1
#endif

    lxvd2x vs0, o0, AO // load real,imag from A
    lxvd2x vs1, o16, AO // load real,imag from A
    lxvd2x vs2, o32, AO // load real,imag from A
    lxvd2x vs3, o48, AO // load real,imag from A

    addi AO, AO, 64

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
    xvmaddadp vs34, vs1, vs16 // real*real, imag*real
    xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
    xvmaddadp vs36, vs2, vs16 // real*real, imag*real
    xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
    xvmaddadp vs38, vs3, vs16 // real*real, imag*real
    xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`SAVE1x4', `
#else
.macro SAVE1x4
#endif


    mr T1, CO

#ifndef TRMMKERNEL

    lxvd2x vs16, o0, T1
    lxvd2x vs17, o16, T1
    lxvd2x vs18, o32, T1
    lxvd2x vs19, o48, T1

#endif


    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs32 // realA*realB
    XSFADD_R2 vs0, vs0, vs33 // imagA*imagB

    XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs32 // realA*imagB
    XSFADD_I2 vs1, vs1, vs33 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs8, vs2, vs3, 0 // merge real and imag part



    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs34 // realA*realB
    XSFADD_R2 vs0, vs0, vs35 // imagA*imagB

    XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs34 // realA*imagB
    XSFADD_I2 vs1, vs1, vs35 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs9, vs2, vs3, 0 // merge real and imag part


    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs36 // realA*realB
    XSFADD_R2 vs0, vs0, vs37 // imagA*imagB

    XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs36 // realA*imagB
    XSFADD_I2 vs1, vs1, vs37 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs10, vs2, vs3, 0 // merge real and imag part



    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs38 // realA*realB
    XSFADD_R2 vs0, vs0, vs39 // imagA*imagB

    XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs38 // realA*imagB
    XSFADD_I2 vs1, vs1, vs39 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs11, vs2, vs3, 0 // merge real and imag part


#ifndef TRMMKERNEL

    xvadddp vs8, vs8, vs16
    xvadddp vs9, vs9, vs17
    xvadddp vs10, vs10, vs18
    xvadddp vs11, vs11, vs19

#endif

    stxvd2x vs8, o0, T1
    stxvd2x vs9, o16, T1
    stxvd2x vs10, o32, T1
    stxvd2x vs11, o48, T1

    add T1, T1, LDC
    addi CO, CO, 64

#if defined(_AIX)
')
#else
.endm
#endif


/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/

#if defined(_AIX)
define(`LOAD1x2_1', `
#else
.macro LOAD1x2_1
#endif

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    lxvd2x vs0, o0, AO // load real,imag from A
    lxvd2x vs1, o16, AO // load real,imag from A

    addi AO, AO, 32


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x2_I1', `
#else
.macro KERNEL1x2_I1
#endif

    lxvd2x vs8, o0, AO // load real,imag from A
    lxvd2x vs9, o16, AO // load real,imag from A

    addi AO, AO, 32

    lxvd2x vs20, o0, BO // load real part from B
    lxvd2x vs21, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmuldp vs32, vs0, vs16 // real*real, imag*real
    xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
    xvmuldp vs34, vs1, vs16 // real*real, imag*real
    xvmuldp vs35, vs1, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x2_1', `
#else
.macro KERNEL1x2_1
#endif

    lxvd2x vs8, o0, AO // load real,imag from A
    lxvd2x vs9, o16, AO // load real,imag from A

    addi AO, AO, 32

    lxvd2x vs20, o0, BO // load real part from B
    lxvd2x vs21, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
    xvmaddadp vs34, vs1, vs16 // real*real, imag*real
    xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x2_2', `
#else
.macro KERNEL1x2_2
#endif

    lxvd2x vs0, o0, AO // load real,imag from A
    lxvd2x vs1, o16, AO // load real,imag from A

    addi AO, AO, 32

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmaddadp vs32, vs8, vs20 // real*real, imag*real
    xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
    xvmaddadp vs34, vs9, vs20 // real*real, imag*real
    xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x2_E2', `
#else
.macro KERNEL1x2_E2
#endif


    xvmaddadp vs32, vs8, vs20 // real*real, imag*real
    xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
    xvmaddadp vs34, vs9, vs20 // real*real, imag*real
    xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x2_SUBI1', `
#else
.macro KERNEL1x2_SUBI1
#endif

    lxvd2x vs0, o0, AO // load real,imag from A
    lxvd2x vs1, o16, AO // load real,imag from A

    addi AO, AO, 32

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmuldp vs32, vs0, vs16 // real*real, imag*real
    xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
    xvmuldp vs34, vs1, vs16 // real*real, imag*real
    xvmuldp vs35, vs1, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x2_SUB1', `
#else
.macro KERNEL1x2_SUB1
#endif

    lxvd2x vs0, o0, AO // load real,imag from A
    lxvd2x vs1, o16, AO // load real,imag from A

    addi AO, AO, 32

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
    xvmaddadp vs34, vs1, vs16 // real*real, imag*real
    xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`SAVE1x2', `
#else
.macro SAVE1x2
#endif


    mr T1, CO

#ifndef TRMMKERNEL

    lxvd2x vs16, o0, T1
    lxvd2x vs17, o16, T1

#endif


    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs32 // realA*realB
    XSFADD_R2 vs0, vs0, vs33 // imagA*imagB

    XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs32 // realA*imagB
    XSFADD_I2 vs1, vs1, vs33 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs8, vs2, vs3, 0 // merge real and imag part



    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs34 // realA*realB
    XSFADD_R2 vs0, vs0, vs35 // imagA*imagB

    XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs34 // realA*imagB
    XSFADD_I2 vs1, vs1, vs35 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs9, vs2, vs3, 0 // merge real and imag part


#ifndef TRMMKERNEL

    xvadddp vs8, vs8, vs16
    xvadddp vs9, vs9, vs17

#endif

    stxvd2x vs8, o0, T1
    stxvd2x vs9, o16, T1

    add T1, T1, LDC
    addi CO, CO, 32

#if defined(_AIX)
')
#else
.endm
#endif


/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/

#if defined(_AIX)
define(`LOAD1x1_1', `
#else
.macro LOAD1x1_1
#endif

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    lxvd2x vs0, o0, AO // load real,imag from A

    addi AO, AO, 16


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x1_I1', `
#else
.macro KERNEL1x1_I1
#endif

    lxvd2x vs8, o0, AO // load real,imag from A

    addi AO, AO, 16

    lxvd2x vs20, o0, BO // load real part from B
    lxvd2x vs21, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmuldp vs32, vs0, vs16 // real*real, imag*real
    xvmuldp vs33, vs0, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x1_1', `
#else
.macro KERNEL1x1_1
#endif

    lxvd2x vs8, o0, AO // load real,imag from A

    addi AO, AO, 16

    lxvd2x vs20, o0, BO // load real part from B
    lxvd2x vs21, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x1_2', `
#else
.macro KERNEL1x1_2
#endif

    lxvd2x vs0, o0, AO // load real,imag from A

    addi AO, AO, 16

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmaddadp vs32, vs8, vs20 // real*real, imag*real
    xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x1_E2', `
#else
.macro KERNEL1x1_E2
#endif


    xvmaddadp vs32, vs8, vs20 // real*real, imag*real
    xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x1_SUBI1', `
#else
.macro KERNEL1x1_SUBI1
#endif

    lxvd2x vs0, o0, AO // load real,imag from A

    addi AO, AO, 16

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmuldp vs32, vs0, vs16 // real*real, imag*real
    xvmuldp vs33, vs0, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x1_SUB1', `
#else
.macro KERNEL1x1_SUB1
#endif

    lxvd2x vs0, o0, AO // load real,imag from A

    addi AO, AO, 16

    lxvd2x vs16, o0, BO // load real part from B
    lxvd2x vs17, o16, BO // load imag part from B

    addi BO, BO, 32

    xvmaddadp vs32, vs0, vs16 // real*real, imag*real
    xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`SAVE1x1', `
#else
.macro SAVE1x1
#endif


    mr T1, CO

#ifndef TRMMKERNEL

    lxvd2x vs16, o0, T1

#endif


    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

    XSFADD_R1 vs0, vs0, vs32 // realA*realB
    XSFADD_R2 vs0, vs0, vs33 // imagA*imagB

    XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
    XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

    XSFADD_I1 vs1, vs1, vs32 // realA*imagB
    XSFADD_I2 vs1, vs1, vs33 // imagA*realB

    xsmuldp vs4, vs0, alpha_r // real*alpha_r
    xsmuldp vs5, vs1, alpha_i // imag*alpha_i
    xsmuldp vs6, vs0, alpha_i // real*alpha_i
    xsmuldp vs7, vs1, alpha_r // imag*alpha_r

    xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
    xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
    xxpermdi vs8, vs2, vs3, 0 // merge real and imag part


#ifndef TRMMKERNEL

    xvadddp vs8, vs8, vs16

#endif

    stxvd2x vs8, o0, T1

    add T1, T1, LDC
    addi CO, CO, 16

#if defined(_AIX)
')
#else
.endm
#endif
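
/**********************************************************************************************
* Note (added comment): the ZCOPYB macros below pack B for the kernels above. Each complex
* element of B (16 bytes) is expanded into two 16-byte vectors, one with the real part
* duplicated in both doublewords (lxvdsx at offset o0 / XXSPLTD(...,0)) and one with the
* imaginary part duplicated (lxvdsx at offset o8 / XXSPLTD(...,1)), so the kernels can load
* the "real part" and "imag part" B vectors directly with lxvd2x.
**********************************************************************************************/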


#if defined(_AIX)
define(`ZCOPYB_1x1', `
#else
.macro ZCOPYB_1x1
#endif

    lxvdsx vs4, o0, BO // b0_r
    lxvdsx vs5, o8, BO // b0_i
    addi BO, BO, 16
    stxvd2x vs4, o0, BBO
    stxvd2x vs5, o16, BBO
    addi BBO, BBO, 32

#if defined(_AIX)
')
#else
.endm
#endif


#if defined(_AIX)
define(`ZCOPYB_8x1', `
#else
.macro ZCOPYB_8x1
#endif

    lxvd2x vs32, o0, BO
    lxvd2x vs33, o16, BO
    lxvd2x vs34, o32, BO
    lxvd2x vs35, o48, BO
    addi BO, BO, 64

    lxvd2x vs36, o0, BO
    lxvd2x vs37, o16, BO
    lxvd2x vs38, o32, BO
    lxvd2x vs39, o48, BO
    addi BO, BO, 64

    XXSPLTD(vs40,vs32,0)
    XXSPLTD(vs41,vs32,1)
    XXSPLTD(vs42,vs33,0)
    XXSPLTD(vs43,vs33,1)
    XXSPLTD(vs44,vs34,0)
    XXSPLTD(vs45,vs34,1)
    XXSPLTD(vs46,vs35,0)
    XXSPLTD(vs47,vs35,1)

    XXSPLTD(vs48,vs36,0)
    XXSPLTD(vs49,vs36,1)
    XXSPLTD(vs50,vs37,0)
    XXSPLTD(vs51,vs37,1)
    XXSPLTD(vs52,vs38,0)
    XXSPLTD(vs53,vs38,1)
    XXSPLTD(vs54,vs39,0)
    XXSPLTD(vs55,vs39,1)

    stxvd2x vs40, o0, BBO
    stxvd2x vs41, o16, BBO
    stxvd2x vs42, o32, BBO
    stxvd2x vs43, o48, BBO
    addi BBO, BBO, 64

    stxvd2x vs44, o0, BBO
    stxvd2x vs45, o16, BBO
    stxvd2x vs46, o32, BBO
    stxvd2x vs47, o48, BBO
    addi BBO, BBO, 64

    stxvd2x vs48, o0, BBO
    stxvd2x vs49, o16, BBO
    stxvd2x vs50, o32, BBO
    stxvd2x vs51, o48, BBO
    addi BBO, BBO, 64

    stxvd2x vs52, o0, BBO
    stxvd2x vs53, o16, BBO
    stxvd2x vs54, o32, BBO
    stxvd2x vs55, o48, BBO
    addi BBO, BBO, 64

#if defined(_AIX)
')
#else
.endm
#endif