1! 2! Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. 3! 4! Licensed under the Apache License, Version 2.0 (the "License"); 5! you may not use this file except in compliance with the License. 6! You may obtain a copy of the License at 7! 8! http://www.apache.org/licenses/LICENSE-2.0 9! 10! Unless required by applicable law or agreed to in writing, software 11! distributed under the License is distributed on an "AS IS" BASIS, 12! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13! See the License for the specific language governing permissions and 14! limitations under the License. 15! 16 17 18 19#include "mmul_dir.h" 20 21subroutine F90_matmul_int1_str1(dest,s1,s2, & 22 k_extnt,m_extnt,n_extnt, & 23 s1_d1_extnt,s2_d1_extnt,d_d1_extnt, & 24 d_d1_lstride) 25 26 DESC_INT n_extnt,m_extnt,k_extnt 27 DESC_INT s1_d1_extnt,s2_d1_extnt,d_d1_extnt,d_d1_lstride 28 DESC_INT bs 29 parameter (bs=192) 30 INTEGER*1 s1(s1_d1_extnt,m_extnt) 31 INTEGER*1 s2(s2_d1_extnt,k_extnt) 32 INTEGER*1 dest(d_d1_extnt,n_extnt*d_d1_lstride) 33 DESC_INT i, j, l, nmod4, mmod4 34 DESC_INT ii,jj,ll,temppos, nn, kk, itest 35 integer*1 t00,t01,t02,t03 36 integer*1 t10,t11,t12,t13 37 integer*1 t20,t21,t22,t23 38 integer*1 t30,t31,t32,t33 39 integer*1 temp0, temp1, temp2, temp3, temp (bs*bs) 40 integer*1 s20, s21, s22, s23 41 INTEGER flag 42 integer*1 zero 43 parameter (zero = 0) 44 45 DESC_INT k,n,m 46 47 if (d_d1_lstride .eq. 1) then 48 nmod4 = mod (k_extnt, 4) 49 mmod4 = mod (n_extnt, 4) 50 kk = k_extnt - nmod4 51 nn = n_extnt - mmod4 52 itest = nmod4 + mmod4 53 do jj=1,k_extnt,bs 54 do ii=1,n_extnt,bs 55 flag = 0 56 do ll=1,m_extnt,bs 57 temppos = 1 58 if ((k_extnt .ge. 4) .and. (n_extnt .ge. 4)) then 59 if (jj .le. kk) then 60 j = jj 61 do i=ii,min(nn,ii+bs-1),4 62 if (flag .eq. 0) then 63 t00=zero 64 t01=zero 65 t02=zero 66 t03=zero 67 t10=zero 68 t11=zero 69 t12=zero 70 t13=zero 71 t20=zero 72 t21=zero 73 t22=zero 74 t23=zero 75 t30=zero 76 t31=zero 77 t32=zero 78 t33=zero 79 else 80 t00=dest(i+0,j+0) 81 t01=dest(i+0,j+1) 82 t02=dest(i+0,j+2) 83 t03=dest(i+0,j+3) 84 t10=dest(i+1,j+0) 85 t11=dest(i+1,j+1) 86 t12=dest(i+1,j+2) 87 t13=dest(i+1,j+3) 88 t20=dest(i+2,j+0) 89 t21=dest(i+2,j+1) 90 t22=dest(i+2,j+2) 91 t23=dest(i+2,j+3) 92 t30=dest(i+3,j+0) 93 t31=dest(i+3,j+1) 94 t32=dest(i+3,j+2) 95 t33=dest(i+3,j+3) 96 end if 97 do l=ll,min(m_extnt,ll+bs-1) 98 temp0 = s1(i+0,l) 99 temp1 = s1(i+1,l) 100 temp2 = s1(i+2,l) 101 temp3 = s1(i+3,l) 102 s20 = s2(l,j+0) 103 s21 = s2(l,j+1) 104 s22 = s2(l,j+2) 105 s23 = s2(l,j+3) 106 t00=t00+s20*temp0 107 t01=t01+s21*temp0 108 t02=t02+s22*temp0 109 t03=t03+s23*temp0 110 t10=t10+s20*temp1 111 t11=t11+s21*temp1 112 t12=t12+s22*temp1 113 t13=t13+s23*temp1 114 t20=t20+s20*temp2 115 t21=t21+s21*temp2 116 t22=t22+s22*temp2 117 t23=t23+s23*temp2 118 t30=t30+s20*temp3 119 t31=t31+s21*temp3 120 t32=t32+s22*temp3 121 t33=t33+s23*temp3 122 temp (temppos+0) = temp0 123 temp (temppos+1) = temp1 124 temp (temppos+2) = temp2 125 temp (temppos+3) = temp3 126 temppos = temppos + 4 127 end do 128 dest(i+0,j+0)=t00 129 dest(i+0,j+1)=t01 130 dest(i+0,j+2)=t02 131 dest(i+0,j+3)=t03 132 dest(i+1,j+0)=t10 133 dest(i+1,j+1)=t11 134 dest(i+1,j+2)=t12 135 dest(i+1,j+3)=t13 136 dest(i+2,j+0)=t20 137 dest(i+2,j+1)=t21 138 dest(i+2,j+2)=t22 139 dest(i+2,j+3)=t23 140 dest(i+3,j+0)=t30 141 dest(i+3,j+1)=t31 142 dest(i+3,j+2)=t32 143 dest(i+3,j+3)=t33 144 end do 145 do j=jj+4,min(kk,jj+bs-1),4 146 temppos = 1 147 do i=ii,min(nn,ii+bs-1),4 148 if (flag .eq. 0) then 149 t00=zero 150 t01=zero 151 t02=zero 152 t03=zero 153 t10=zero 154 t11=zero 155 t12=zero 156 t13=zero 157 t20=zero 158 t21=zero 159 t22=zero 160 t23=zero 161 t30=zero 162 t31=zero 163 t32=zero 164 t33=zero 165 else 166 t00=dest(i+0,j+0) 167 t01=dest(i+0,j+1) 168 t02=dest(i+0,j+2) 169 t03=dest(i+0,j+3) 170 t10=dest(i+1,j+0) 171 t11=dest(i+1,j+1) 172 t12=dest(i+1,j+2) 173 t13=dest(i+1,j+3) 174 t20=dest(i+2,j+0) 175 t21=dest(i+2,j+1) 176 t22=dest(i+2,j+2) 177 t23=dest(i+2,j+3) 178 t30=dest(i+3,j+0) 179 t31=dest(i+3,j+1) 180 t32=dest(i+3,j+2) 181 t33=dest(i+3,j+3) 182 endif 183 do l=ll,min(m_extnt,ll+bs-1) 184 temp0 = temp (temppos+0) 185 temp1 = temp (temppos+1) 186 temp2 = temp (temppos+2) 187 temp3 = temp (temppos+3) 188 s20 = s2(l,j+0) 189 s21 = s2(l,j+1) 190 s22 = s2(l,j+2) 191 s23 = s2(l,j+3) 192 t00=t00+s20*temp0 193 t01=t01+s21*temp0 194 t02=t02+s22*temp0 195 t03=t03+s23*temp0 196 t10=t10+s20*temp1 197 t11=t11+s21*temp1 198 t12=t12+s22*temp1 199 t13=t13+s23*temp1 200 t20=t20+s20*temp2 201 t21=t21+s21*temp2 202 t22=t22+s22*temp2 203 t23=t23+s23*temp2 204 t30=t30+s20*temp3 205 t31=t31+s21*temp3 206 t32=t32+s22*temp3 207 t33=t33+s23*temp3 208 temppos = temppos + 4 209 end do 210 dest(i+0,j+0)=t00 211 dest(i+0,j+1)=t01 212 dest(i+0,j+2)=t02 213 dest(i+0,j+3)=t03 214 dest(i+1,j+0)=t10 215 dest(i+1,j+1)=t11 216 dest(i+1,j+2)=t12 217 dest(i+1,j+3)=t13 218 dest(i+2,j+0)=t20 219 dest(i+2,j+1)=t21 220 dest(i+2,j+2)=t22 221 dest(i+2,j+3)=t23 222 dest(i+3,j+0)=t30 223 dest(i+3,j+1)=t31 224 dest(i+3,j+2)=t32 225 dest(i+3,j+3)=t33 226 end do 227 end do 228 end if 229 end if 230 if (itest .ne. 0) then 231 if (nmod4 .ne. 0) then 232 temppos = 1 233 do j=kk+1,min(kk+1,jj+bs-1) 234 do i=ii,min(nn,ii+bs-1),4 235 if (flag .eq. 0) then 236 t00=zero 237 t10=zero 238 t20=zero 239 t30=zero 240 else 241 t00=dest(i+0,j+0) 242 t10=dest(i+1,j+0) 243 t20=dest(i+2,j+0) 244 t30=dest(i+3,j+0) 245 end if 246 do l=ll,min(m_extnt,ll+bs-1) 247 temp0 = s1(i+0,l) 248 temp1 = s1(i+1,l) 249 temp2 = s1(i+2,l) 250 temp3 = s1(i+3,l) 251 t00=t00+s2(l,j+0)*temp0 252 t10=t10+s2(l,j+0)*temp1 253 t20=t20+s2(l,j+0)*temp2 254 t30=t30+s2(l,j+0)*temp3 255 temp (temppos+0) = temp0 256 temp (temppos+1) = temp1 257 temp (temppos+2) = temp2 258 temp (temppos+3) = temp3 259 temppos = temppos + 4 260 end do 261 dest(i+0,j+0)=t00 262 dest(i+1,j+0)=t10 263 dest(i+2,j+0)=t20 264 dest(i+3,j+0)=t30 265 end do 266 end do 267 do j=kk+2,min(k_extnt,jj+bs-1) 268 temppos = 1 269 do i=ii,min(nn,ii+bs-1),4 270 if (flag .eq. 0) then 271 t00=zero 272 t10=zero 273 t20=zero 274 t30=zero 275 else 276 t00=dest(i+0,j+0) 277 t10=dest(i+1,j+0) 278 t20=dest(i+2,j+0) 279 t30=dest(i+3,j+0) 280 end if 281 do l=ll,min(m_extnt,ll+bs-1) 282 temp0 = temp (temppos+0) 283 temp1 = temp (temppos+1) 284 temp2 = temp (temppos+2) 285 temp3 = temp (temppos+3) 286 t00=t00+s2(l,j+0)*temp0 287 t10=t10+s2(l,j+0)*temp1 288 t20=t20+s2(l,j+0)*temp2 289 t30=t30+s2(l,j+0)*temp3 290 temppos = temppos + 4 291 end do 292 dest(i+0,j+0)=t00 293 dest(i+1,j+0)=t10 294 dest(i+2,j+0)=t20 295 dest(i+3,j+0)=t30 296 end do 297 end do 298 end if 299 if (mmod4 .ne. 0) then 300 temppos = 1 301 if (jj .le. kk) then 302 j = jj 303 do i=nn+1,min(n_extnt,ii+bs-1) 304 if (flag .eq. 0) then 305 t00=zero 306 t01=zero 307 t02=zero 308 t03=zero 309 else 310 t00=dest(i+0,j+0) 311 t01=dest(i+0,j+1) 312 t02=dest(i+0,j+2) 313 t03=dest(i+0,j+3) 314 end if 315 do l=ll,min(m_extnt,ll+bs-1) 316 temp0 = s1(i+0,l) 317 t00=t00+s2(l,j+0)*temp0 318 t01=t01+s2(l,j+1)*temp0 319 t02=t02+s2(l,j+2)*temp0 320 t03=t03+s2(l,j+3)*temp0 321 temp (temppos) = temp0 322 temppos = temppos + 1 323 end do 324 dest(i+0,j+0)=t00 325 dest(i+0,j+1)=t01 326 dest(i+0,j+2)=t02 327 dest(i+0,j+3)=t03 328 end do 329 do j=jj+4,min(kk,jj+bs-1),4 330 temppos = 1 331 do i=nn+1,min(n_extnt,ii+bs-1) 332 if (flag .eq. 0) then 333 t00=zero 334 t01=zero 335 t02=zero 336 t03=zero 337 else 338 t00=dest(i+0,j+0) 339 t01=dest(i+0,j+1) 340 t02=dest(i+0,j+2) 341 t03=dest(i+0,j+3) 342 end if 343 do l=ll,min(m_extnt,ll+bs-1) 344 temp0 = temp (temppos) 345 t00=t00+s2(l,j+0)*temp0 346 t01=t01+s2(l,j+1)*temp0 347 t02=t02+s2(l,j+2)*temp0 348 t03=t03+s2(l,j+3)*temp0 349 temppos = temppos + 1 350 end do 351 dest(i+0,j+0)=t00 352 dest(i+0,j+1)=t01 353 dest(i+0,j+2)=t02 354 dest(i+0,j+3)=t03 355 end do 356 end do 357 end if 358 end if 359 if ((nmod4 .ne. 0) .and. (mmod4 .ne. 0)) then 360 temppos = 1 361 do j=kk+1,min(kk+1,jj+bs-1) 362 do i=nn+1,min(n_extnt,ii+bs-1) 363 if (flag .eq. 0) then 364 t00=zero 365 else 366 t00=dest(i,j) 367 end if 368 do l=ll,min(m_extnt,ll+bs-1) 369 temp0 = s1(i,l) 370 t00=t00+s2(l,j)*temp0 371 temp (temppos) = temp0 372 temppos = temppos + 1 373 end do 374 dest(i,j)=t00 375 end do 376 end do 377 do j=kk+2,min(k_extnt,jj+bs-1) 378 temppos = 1 379 do i=nn+1,min(n_extnt,ii+bs-1) 380 if (flag .eq. 0) then 381 t00=zero 382 else 383 t00=dest(i,j) 384 end if 385 do l=ll,min(m_extnt,ll+bs-1) 386 temp0 = temp (temppos) 387 t00=t00+s2(l,j)*temp0 388 temppos = temppos + 1 389 end do 390 dest(i,j)=t00 391 end do 392 end do 393 end if 394 end if 395 flag = 1 396 end do 397 end do 398 end do 399 else 400 do k = 1, k_extnt 401 do n = 1, n_extnt 402 dest(1+(n-1)*d_d1_lstride,k) = 0.0d0 403 enddo 404 enddo 405 do k = 1, k_extnt 406 do m = 1, m_extnt 407 do n = 1, n_extnt 408 dest(1+(n-1)*d_d1_lstride,k) = & 409 dest(1+(n-1)*d_d1_lstride,k) + & 410 s1(n,m) * s2(m,k) 411 enddo 412 enddo 413 enddo 414 endif 415 return 416 end 417 418subroutine F90_matmul_int1_str1_mxv(dest, s1,s2, & 419 n_extent,m_extent, ld1,dlstride) 420 421 implicit none 422 DESC_INT n_extent,m_extent,ld1,ld2,dlstride 423 DESC_INT mmod4, mmod2, m2 424 DESC_INT jx,kx,jj,kk,kmod4,k4,incx 425 DESC_INT j0,j1,j2,j3,iy,ky,m4 426 INTEGER*1 t0,t1,t2,t3 427 INTEGER*1 s1(ld1,m_extent) 428 INTEGER*1 s2(m_extent) 429 INTEGER*1 dest(ld1) 430 431 DESC_INT i,j,k 432 INTEGER bs 433 parameter (bs = 384) 434 INTEGER*1 temp (bs) 435 DESC_INT ind(bs) 436 INTEGER*1 zero 437 parameter (zero = 0) 438 439 if (dlstride .eq. 1) then 440 do k = 1, n_extent 441 dest(k) = 0.0d0 442 end do 443 mmod4 = mod(n_extent, 4) 444 mmod2 = mod(n_extent, 2) 445 m4 = n_extent - mmod4 446 m2 = n_extent- mmod2 447 kx = 1 448 ky = 1 449 incx = 1 450 jx = kx 451 do jj = 1, m_extent, bs 452 jx = kx + (jj-1) 453 kk = 0 454 do j = jj, min (m_extent, jj+bs-1) 455 if (s2(jx) .ne. zero) then 456 kk = kk + 1 457 temp(kk) = s2(jx) 458 ind(kk) = j 459 end if 460 jx = jx + 1 461 end do 462 kmod4 = mod(kk, 4) 463 k4 = kk - kmod4 464 do j = 1, k4, 4 465 t0 = temp(j) 466 t1 = temp(j+1) 467 t2 = temp(j+2) 468 t3 = temp(j+3) 469 j0 = ind(j) 470 j1 = ind(j+1) 471 j2 = ind(j+2) 472 j3 = ind(j+3) 473 iy = ky 474 do i = 1, m4, 4 475 dest( iy ) = dest( iy )+t0*s1(i, j0) & 476 + t1*s1(i, j1) & 477 + t2*s1(i, j2) & 478 + t3*s1(i, j3) 479 480 dest(iy+1) = dest(iy+1)+t0*s1(i+1, j0) & 481 + t1*s1(i+1, j1) & 482 + t2*s1(i+1, j2) & 483 + t3*s1(i+1, j3) 484 485 dest(iy+2) = dest(iy+2)+t0*s1(i+2, j0) & 486 + t1*s1(i+2, j1) & 487 + t2*s1(i+2, j2) & 488 + t3*s1(i+2, j3) 489 490 dest(iy+3) = dest(iy+3 )+t0*s1(i+3, j0) & 491 + t1*s1(i+3, j1) & 492 + t2*s1(i+3, j2) & 493 + t3*s1(i+3, j3) 494 iy = iy + 4 495 end do 496 iy = ky + m4 497 do i = m4 + 1, n_extent 498 dest(iy) = dest(iy) + t0*s1(i, j0) & 499 + t1*s1(i, j1) & 500 + t2*s1(i, j2) & 501 + t3*s1(i, j3) 502 iy = iy + 1 503 end do 504 end do 505 do j = k4+1, kk 506 t0 = temp(j) 507 j0 = ind(j) 508 iy = ky 509 do i = 1, m4, 4 510 dest( iy ) = dest(iy) +t0*s1(i,j0) 511 dest( iy+1) = dest(iy+1) +t0*s1(i+1,j0) 512 dest( iy+2) = dest(iy+2)+t0*s1(i+2,j0) 513 dest( iy+3) = dest(iy+3)+t0*s1(i+3,j0) 514 iy = iy + 4 515 end do 516 end do 517 do j = k4+1, kk 518 t0 = temp(j) 519 j0 = ind(j) 520 iy = ky + m4 521 do i = m4 + 1, n_extent 522 dest(iy) = dest(iy) + t0 * s1(i, j0) 523 iy = iy + 1 524 end do 525 end do 526 527 end do 528 else 529 do k = 1, n_extent 530 dest(1+(k-1)*dlstride) = 0.0d0 531 enddo 532 mmod4 = mod(n_extent, 4) 533 mmod2 = mod(n_extent, 2) 534 m4 = n_extent - mmod4 535 m2 = n_extent- mmod2 536 kx = 1 537 ky = 1 538 incx = 1 539 jx = kx 540 do jj = 1, m_extent, bs 541 jx = kx + (jj-1) 542 kk = 0 543 do j = jj, min (m_extent, jj+bs-1) 544 if (s2(jx) .ne. zero) then 545 kk = kk + 1 546 temp(kk) = s2(jx) 547 ind(kk) = j 548 end if 549 jx = jx + 1 550 end do 551 kmod4 = mod(kk, 4) 552 k4 = kk - kmod4 553 do j = 1, k4, 4 554 t0 = temp(j) 555 t1 = temp(j+1) 556 t2 = temp(j+2) 557 t3 = temp(j+3) 558 j0 = ind(j) 559 j1 = ind(j+1) 560 j2 = ind(j+2) 561 j3 = ind(j+3) 562 iy = ky 563 do i = 1, m4, 4 564 dest(1+(iy-1)*dlstride) = dest(1+(iy-1)*dlstride) + t0*s1(i, j0) & 565 + t1*s1(i, j1) & 566 + t2*s1(i, j2) & 567 + t3*s1(i, j3) 568 569 dest(1+(iy + 1 -1)*dlstride) = dest(1+(iy + 1 -1)*dlstride) + t0*s1(i+1, j0) & 570 + t1*s1(i+1, j1) & 571 + t2*s1(i+1, j2) & 572 + t3*s1(i+1, j3) 573 574 dest(1+(iy + 2 -1)*dlstride) = dest(1+(iy + 2 -1)*dlstride) + t0*s1(i+2, j0) & 575 + t1*s1(i+2, j1) & 576 + t2*s1(i+2, j2) & 577 + t3*s1(i+2, j3) 578 579 dest(1+(iy + 3 -1)*dlstride) = dest(1+(iy + 3 -1)*dlstride) + t0*s1(i+3, j0) & 580 + t1*s1(i+3, j1) & 581 + t2*s1(i+3, j2) & 582 + t3*s1(i+3, j3) 583 iy = iy + 4 584 end do 585 iy = ky + m4 586 do i = m4 + 1, n_extent 587 dest(1+(iy-1)*dlstride) = dest(1+(iy-1)*dlstride) + t0*s1(i, j0) & 588 + t1*s1(i, j1) & 589 + t2*s1(i, j2) & 590 + t3*s1(i, j3) 591 iy = iy + 1 592 end do 593 end do 594 do j = k4+1, kk 595 t0 = temp(j) 596 j0 = ind(j) 597 iy = ky 598 do i = 1, m4, 4 599 dest(1+(iy-1)*dlstride) = dest(1+(iy-1)*dlstride) + t0*s1(i,j0) 600 dest(1+(iy + 1 -1)*dlstride) = dest(1+(iy + 1 -1)*dlstride) + t0*s1(i+1,j0) 601 dest(1+(iy + 2 -1)*dlstride) = dest(1+(iy + 2 -1)*dlstride) + t0*s1(i+2,j0) 602 dest(1+(iy + 3 -1)*dlstride) = dest(1+(iy + 3 -1)*dlstride) + t0*s1(i+3,j0) 603 iy = iy + 4 604 end do 605 end do 606 do j = k4+1, kk 607 t0 = temp(j) 608 j0 = ind(j) 609 iy = ky + m4 610 do i = m4 + 1, n_extent 611 dest(1+(iy-1)*dlstride) = dest(1+(iy-1)*dlstride) + t0 * s1(i, j0) 612 iy = iy + 1 613 end do 614 end do 615 end do 616 endif 617end 618 619subroutine F90_matmul_int1_str1_vxm(dest, s1,s2, & 620 k_extent,m_extent, ld1,dlstride) 621 622 implicit none 623 DESC_INT k_extent,m_extent,ld1,ld2,dlstride 624 INTEGER*1 s1(m_extent) 625 INTEGER*1 s2(ld1,k_extent) 626 INTEGER*1 dest(k_extent) 627 628 INTEGER bs 629 parameter (bs = 384) 630 INTEGER*1 temp (bs) 631 INTEGER*1 t0,t1,t2,t3 632 INTEGER*1 t4,t5,t6,t7 633 INTEGER*1 dtemp0, dtemp1, dtemp2, dtemp3 634 INTEGER*1 dtemp4, dtemp5, dtemp6, dtemp7 635 DESC_INT ind(bs),ind0,ind1,ind2,ind3 636 DESC_INT ind4,ind5,ind6,ind7 637 INTEGER*1 zero 638 parameter (zero = 0) 639 640 DESC_INT mi,ki,ti 641 DESC_INT ii,jj,j,jx,kk,jnext 642 DESC_INT mmod8,m8,kmod8,k8,tmod8,tt8 643 DESC_INT mmod4,m4,kmod4,k4,tmod4,tt4 644 645 if (dlstride .eq. 1) then 646 do ki = 1, k_extent 647 dest(ki) = 0 648 end do 649 650 mmod8 = mod(m_extent,8) 651 m8 = m_extent - mmod8 652 kmod8 = mod(k_extent,8) 653 k8 = k_extent - kmod8 654 655 do kk = 1,k8,8 656 dtemp0 = dest(kk) 657 dtemp1 = dest(kk+1) 658 dtemp2 = dest(kk+2) 659 dtemp3 = dest(kk+3) 660 dtemp4 = dest(kk+4) 661 dtemp5 = dest(kk+5) 662 dtemp6 = dest(kk+6) 663 dtemp7 = dest(kk+7) 664 jnext = 1 665 do jj = 1,m8,bs 666 ! load s1 temp vector 667 ti = 0 668 jx = jj 669 do j = jj, min (m_extent, jj+bs-1) 670 if (s1(jx) .ne. zero) then 671 ti = ti + 1 672 temp(ti) = s1(jx) 673 ind(ti) = j 674 end if 675 jx = jx + 1 676 end do 677 678 tmod8 = mod(ti,8) 679 tt8 = ti - tmod8 680 681 if (tt8 .ne. 0) then 682 jnext = ind(tt8)+1 683 endif 684 685 ti = 1 686 do ii = 1,tt8,8 687 t0 = temp(ti) 688 t1 = temp(ti+1) 689 t2 = temp(ti+2) 690 t3 = temp(ti+3) 691 t4 = temp(ti+4) 692 t5 = temp(ti+5) 693 t6 = temp(ti+6) 694 t7 = temp(ti+7) 695 ind0 = ind(ti) 696 ind1 = ind(ti+1) 697 ind2 = ind(ti+2) 698 ind3 = ind(ti+3) 699 ind4 = ind(ti+4) 700 ind5 = ind(ti+5) 701 ind6 = ind(ti+6) 702 ind7 = ind(ti+7) 703 704 dtemp0 = dtemp0 + & 705 t0 * s2(ind0, kk) + t1 * s2(ind1, kk) + & 706 t2 * s2(ind2, kk) + t3 * s2(ind3, kk) + & 707 t4 * s2(ind4, kk) + t5 * s2(ind5, kk) + & 708 t6 * s2(ind6, kk) + t7 * s2(ind7, kk) 709 dtemp1 = dtemp1 + & 710 t0 * s2(ind0, kk+1) + t1 * s2(ind1, kk+1) + & 711 t2 * s2(ind2, kk+1) + t3 * s2(ind3, kk+1) + & 712 t4 * s2(ind4, kk+1) + t5 * s2(ind5, kk+1) + & 713 t6 * s2(ind6, kk+1) + t7 * s2(ind7, kk+1) 714 dtemp2 = dtemp2 + & 715 t0 * s2(ind0, kk+2) + t1 * s2(ind1, kk+2) + & 716 t2 * s2(ind2, kk+2) + t3 * s2(ind3, kk+2) + & 717 t4 * s2(ind4, kk+2) + t5 * s2(ind5, kk+2) + & 718 t6 * s2(ind6, kk+2) + t7 * s2(ind7, kk+2) 719 dtemp3 = dtemp3 + & 720 t0 * s2(ind0, kk+3) + t1 * s2(ind1, kk+3) + & 721 t2 * s2(ind2, kk+3) + t3 * s2(ind3, kk+3) + & 722 t4 * s2(ind4, kk+3) + t5 * s2(ind5, kk+3) + & 723 t6 * s2(ind6, kk+3) + t7 * s2(ind7, kk+3) 724 725 dtemp4 = dtemp4 + & 726 t0 * s2(ind0, kk+4) + t1 * s2(ind1, kk+4) + & 727 t2 * s2(ind2, kk+4) + t3 * s2(ind3, kk+4) + & 728 t4 * s2(ind4, kk+4) + t5 * s2(ind5, kk+4) + & 729 t6 * s2(ind6, kk+4) + t7 * s2(ind7, kk+4) 730 dtemp5 = dtemp5 + & 731 t0 * s2(ind0, kk+5) + t1 * s2(ind1, kk+5) + & 732 t2 * s2(ind2, kk+5) + t3 * s2(ind3, kk+5) + & 733 t4 * s2(ind4, kk+5) + t5 * s2(ind5, kk+5) + & 734 t6 * s2(ind6, kk+5) + t7 * s2(ind7, kk+5) 735 dtemp6 = dtemp6 + & 736 t0 * s2(ind0, kk+6) + t1 * s2(ind1, kk+6) + & 737 t2 * s2(ind2, kk+6) + t3 * s2(ind3, kk+6) + & 738 t4 * s2(ind4, kk+6) + t5 * s2(ind5, kk+6) + & 739 t6 * s2(ind6, kk+6) + t7 * s2(ind7, kk+6) 740 dtemp7 = dtemp7 + & 741 t0 * s2(ind0, kk+7) + t1 * s2(ind1, kk+7) + & 742 t2 * s2(ind2, kk+7) + t3 * s2(ind3, kk+7) + & 743 t4 * s2(ind4, kk+7) + t5 * s2(ind5, kk+7) + & 744 t6 * s2(ind6, kk+7) + t7 * s2(ind7, kk+7) 745 ti = ti + 8 746 enddo 747 enddo 748 do ii = jnext,m_extent 749 t0 = s1(ii) 750 dtemp0 = dtemp0 + t0 * s2(ii,kk) 751 dtemp1 = dtemp1 + t0 * s2(ii,kk+1) 752 dtemp2 = dtemp2 + t0 * s2(ii,kk+2) 753 dtemp3 = dtemp3 + t0 * s2(ii,kk+3) 754 dtemp4 = dtemp4 + t0 * s2(ii,kk+4) 755 dtemp5 = dtemp5 + t0 * s2(ii,kk+5) 756 dtemp6 = dtemp6 + t0 * s2(ii,kk+6) 757 dtemp7 = dtemp7 + t0 * s2(ii,kk+7) 758 enddo 759 dest(kk) = dtemp0 760 dest(kk+1) = dtemp1 761 dest(kk+2) = dtemp2 762 dest(kk+3) = dtemp3 763 dest(kk+4) = dtemp4 764 dest(kk+5) = dtemp5 765 dest(kk+6) = dtemp6 766 dest(kk+7) = dtemp7 767 enddo 768 do kk = k8+1,k_extent 769 dtemp0 = dest(kk) 770 do ii = 1,m_extent 771 dtemp0 = dtemp0 + s1(ii) * s2(ii,kk) 772 enddo 773 dest(kk) = dtemp0 774 enddo 775 776 777 else 778 do kk = 1, k_extent 779 dest(1+(kk-1)*dlstride) = 0 780 end do 781 782 mmod4 = mod(m_extent,4) 783 m4 = m_extent - mmod4 784 kmod4 = mod(k_extent,4) 785 k4 = k_extent - kmod4 786 787 do kk = 1,k4,4 788 dtemp0 = dest(1+(kk-1)*dlstride) 789 dtemp1 = dest(1+(kk)*dlstride) 790 dtemp2 = dest(1+(kk+1)*dlstride) 791 dtemp3 = dest(1+(kk+2)*dlstride) 792 jnext = 1 793 do jj = 1,m4,bs 794 ! load s1 temp vector 795 ti = 0 796 jx = jj 797 do j = jj, min (m_extent, jj+bs-1) 798 if (s1(jx) .ne. zero) then 799 ti = ti + 1 800 temp(ti) = s1(jx) 801 ind(ti) = j 802 end if 803 jx = jx + 1 804 end do 805 806 tmod4 = mod(ti,4) 807 tt4 = ti - tmod4 808 809 if (tt4 .ne. 0) then 810 jnext = ind(tt4)+1 811 endif 812 813 ti = 1 814 do ii = 1,tt4,4 815 t0 = temp(ti) 816 t1 = temp(ti+1) 817 t2 = temp(ti+2) 818 t3 = temp(ti+3) 819 ind0 = ind(ti) 820 ind1 = ind(ti+1) 821 ind2 = ind(ti+2) 822 ind3 = ind(ti+3) 823 824 dtemp0 = dtemp0 + & 825 t0 * s2(ind0, kk) + t1 * s2(ind1, kk) + & 826 t2 * s2(ind2, kk) + t3 * s2(ind3, kk) 827 dtemp1 = dtemp1 + & 828 t0 * s2(ind0, kk+1) + t1 * s2(ind1, kk+1) + & 829 t2 * s2(ind2, kk+1) + t3 * s2(ind3, kk+1) 830 dtemp2 = dtemp2 + & 831 t0 * s2(ind0, kk+2) + t1 * s2(ind1, kk+2) + & 832 t2 * s2(ind2, kk+2) + t3 * s2(ind3, kk+2) 833 dtemp3 = dtemp3 + & 834 t0 * s2(ind0, kk+3) + t1 * s2(ind1, kk+3) + & 835 t2 * s2(ind2, kk+3) + t3 * s2(ind3, kk+3) 836 837 ti = ti + 4 838 enddo 839 enddo 840 do ii = jnext,m_extent 841 t0 = s1(ii) 842 dtemp0 = dtemp0 + t0 * s2(ii,kk) 843 dtemp1 = dtemp1 + t0 * s2(ii,kk+1) 844 dtemp2 = dtemp2 + t0 * s2(ii,kk+2) 845 dtemp3 = dtemp3 + t0 * s2(ii,kk+3) 846 enddo 847 dest(1+(kk-1)*dlstride) = dtemp0 848 dest(1+(kk)*dlstride) = dtemp1 849 dest(1+(kk+1)*dlstride) = dtemp2 850 dest(1+(kk+2)*dlstride) = dtemp3 851 enddo 852 do kk = k4+1,k_extent 853 dtemp0 = dest(1+(kk-1)*dlstride) 854 do ii = 1,m_extent 855 dtemp0 = dtemp0 + s1(ii) * s2(ii,kk) 856 enddo 857 dest(1+(kk-1)*dlstride) = dtemp0 858 enddo 859 endif 860end 861 862