1!
2! Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
3!
4! Licensed under the Apache License, Version 2.0 (the "License");
5! you may not use this file except in compliance with the License.
6! You may obtain a copy of the License at
7!
8!     http://www.apache.org/licenses/LICENSE-2.0
9!
10! Unless required by applicable law or agreed to in writing, software
11! distributed under the License is distributed on an "AS IS" BASIS,
12! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13! See the License for the specific language governing permissions and
14! limitations under the License.
15!
16
17
18
19#include "mmul_dir.h"
20
21subroutine F90_matmul_int1_str1(dest,s1,s2, &
22      k_extnt,m_extnt,n_extnt,                   &
23      s1_d1_extnt,s2_d1_extnt,d_d1_extnt,        &
24      d_d1_lstride)
25
26      DESC_INT  n_extnt,m_extnt,k_extnt
27      DESC_INT  s1_d1_extnt,s2_d1_extnt,d_d1_extnt,d_d1_lstride
28      DESC_INT  bs
29      parameter (bs=192)
30      INTEGER*1 s1(s1_d1_extnt,m_extnt)
31      INTEGER*1 s2(s2_d1_extnt,k_extnt)
32      INTEGER*1 dest(d_d1_extnt,n_extnt*d_d1_lstride)
33      DESC_INT  i, j, l, nmod4, mmod4
34      DESC_INT  ii,jj,ll,temppos, nn, kk, itest
35      integer*1   t00,t01,t02,t03
36      integer*1   t10,t11,t12,t13
37      integer*1   t20,t21,t22,t23
38      integer*1   t30,t31,t32,t33
39      integer*1 temp0, temp1, temp2, temp3, temp (bs*bs)
40      integer*1 s20, s21, s22, s23
41      INTEGER   flag
42      integer*1 zero
43      parameter (zero = 0)
44
45      DESC_INT  k,n,m
46
47      if (d_d1_lstride .eq. 1) then
48         nmod4 = mod (k_extnt, 4)
49         mmod4 = mod (n_extnt, 4)
50         kk = k_extnt - nmod4
51         nn = n_extnt - mmod4
52         itest = nmod4 + mmod4
53         do jj=1,k_extnt,bs
54          do ii=1,n_extnt,bs
55           flag = 0
56           do ll=1,m_extnt,bs
57            temppos = 1
58            if ((k_extnt .ge. 4) .and. (n_extnt .ge. 4)) then
59            if (jj .le. kk) then
60             j = jj
61             do i=ii,min(nn,ii+bs-1),4
62              if (flag .eq. 0) then
63               t00=zero
64               t01=zero
65               t02=zero
66               t03=zero
67               t10=zero
68               t11=zero
69               t12=zero
70               t13=zero
71               t20=zero
72               t21=zero
73               t22=zero
74               t23=zero
75               t30=zero
76               t31=zero
77               t32=zero
78               t33=zero
79              else
80               t00=dest(i+0,j+0)
81               t01=dest(i+0,j+1)
82               t02=dest(i+0,j+2)
83               t03=dest(i+0,j+3)
84               t10=dest(i+1,j+0)
85               t11=dest(i+1,j+1)
86               t12=dest(i+1,j+2)
87               t13=dest(i+1,j+3)
88               t20=dest(i+2,j+0)
89               t21=dest(i+2,j+1)
90               t22=dest(i+2,j+2)
91               t23=dest(i+2,j+3)
92               t30=dest(i+3,j+0)
93               t31=dest(i+3,j+1)
94               t32=dest(i+3,j+2)
95               t33=dest(i+3,j+3)
96              end if
97              do l=ll,min(m_extnt,ll+bs-1)
98               temp0 = s1(i+0,l)
99               temp1 = s1(i+1,l)
100               temp2 = s1(i+2,l)
101               temp3 = s1(i+3,l)
102               s20 = s2(l,j+0)
103               s21 = s2(l,j+1)
104               s22 = s2(l,j+2)
105               s23 = s2(l,j+3)
106               t00=t00+s20*temp0
107               t01=t01+s21*temp0
108               t02=t02+s22*temp0
109               t03=t03+s23*temp0
110               t10=t10+s20*temp1
111               t11=t11+s21*temp1
112               t12=t12+s22*temp1
113               t13=t13+s23*temp1
114               t20=t20+s20*temp2
115               t21=t21+s21*temp2
116               t22=t22+s22*temp2
117               t23=t23+s23*temp2
118               t30=t30+s20*temp3
119               t31=t31+s21*temp3
120               t32=t32+s22*temp3
121               t33=t33+s23*temp3
122               temp (temppos+0) = temp0
123               temp (temppos+1) = temp1
124               temp (temppos+2) = temp2
125               temp (temppos+3) = temp3
126               temppos = temppos + 4
127              end do
128              dest(i+0,j+0)=t00
129              dest(i+0,j+1)=t01
130              dest(i+0,j+2)=t02
131              dest(i+0,j+3)=t03
132              dest(i+1,j+0)=t10
133              dest(i+1,j+1)=t11
134              dest(i+1,j+2)=t12
135              dest(i+1,j+3)=t13
136              dest(i+2,j+0)=t20
137              dest(i+2,j+1)=t21
138              dest(i+2,j+2)=t22
139              dest(i+2,j+3)=t23
140              dest(i+3,j+0)=t30
141              dest(i+3,j+1)=t31
142              dest(i+3,j+2)=t32
143              dest(i+3,j+3)=t33
144             end do
145            do j=jj+4,min(kk,jj+bs-1),4
146             temppos = 1
147             do i=ii,min(nn,ii+bs-1),4
148              if (flag .eq. 0) then
149               t00=zero
150               t01=zero
151               t02=zero
152               t03=zero
153               t10=zero
154               t11=zero
155               t12=zero
156               t13=zero
157               t20=zero
158               t21=zero
159               t22=zero
160               t23=zero
161               t30=zero
162               t31=zero
163               t32=zero
164               t33=zero
165              else
166               t00=dest(i+0,j+0)
167               t01=dest(i+0,j+1)
168               t02=dest(i+0,j+2)
169               t03=dest(i+0,j+3)
170               t10=dest(i+1,j+0)
171               t11=dest(i+1,j+1)
172               t12=dest(i+1,j+2)
173               t13=dest(i+1,j+3)
174               t20=dest(i+2,j+0)
175               t21=dest(i+2,j+1)
176               t22=dest(i+2,j+2)
177               t23=dest(i+2,j+3)
178               t30=dest(i+3,j+0)
179               t31=dest(i+3,j+1)
180               t32=dest(i+3,j+2)
181               t33=dest(i+3,j+3)
182              endif
183              do l=ll,min(m_extnt,ll+bs-1)
184               temp0 = temp (temppos+0)
185               temp1 = temp (temppos+1)
186               temp2 = temp (temppos+2)
187               temp3 = temp (temppos+3)
188               s20 = s2(l,j+0)
189               s21 = s2(l,j+1)
190               s22 = s2(l,j+2)
191               s23 = s2(l,j+3)
192               t00=t00+s20*temp0
193               t01=t01+s21*temp0
194               t02=t02+s22*temp0
195               t03=t03+s23*temp0
196               t10=t10+s20*temp1
197               t11=t11+s21*temp1
198               t12=t12+s22*temp1
199               t13=t13+s23*temp1
200               t20=t20+s20*temp2
201               t21=t21+s21*temp2
202               t22=t22+s22*temp2
203               t23=t23+s23*temp2
204               t30=t30+s20*temp3
205               t31=t31+s21*temp3
206               t32=t32+s22*temp3
207               t33=t33+s23*temp3
208               temppos = temppos + 4
209              end do
210              dest(i+0,j+0)=t00
211              dest(i+0,j+1)=t01
212              dest(i+0,j+2)=t02
213              dest(i+0,j+3)=t03
214              dest(i+1,j+0)=t10
215              dest(i+1,j+1)=t11
216              dest(i+1,j+2)=t12
217              dest(i+1,j+3)=t13
218              dest(i+2,j+0)=t20
219              dest(i+2,j+1)=t21
220              dest(i+2,j+2)=t22
221              dest(i+2,j+3)=t23
222              dest(i+3,j+0)=t30
223              dest(i+3,j+1)=t31
224              dest(i+3,j+2)=t32
225              dest(i+3,j+3)=t33
226             end do
227            end do
228            end if
229            end if
230            if (itest .ne. 0) then
231            if (nmod4 .ne. 0) then
232             temppos = 1
233             do j=kk+1,min(kk+1,jj+bs-1)
234              do i=ii,min(nn,ii+bs-1),4
235               if (flag .eq. 0) then
236                t00=zero
237                t10=zero
238                t20=zero
239                t30=zero
240               else
241                t00=dest(i+0,j+0)
242                t10=dest(i+1,j+0)
243                t20=dest(i+2,j+0)
244                t30=dest(i+3,j+0)
245               end if
246               do l=ll,min(m_extnt,ll+bs-1)
247                temp0 = s1(i+0,l)
248                temp1 = s1(i+1,l)
249                temp2 = s1(i+2,l)
250                temp3 = s1(i+3,l)
251                t00=t00+s2(l,j+0)*temp0
252                t10=t10+s2(l,j+0)*temp1
253                t20=t20+s2(l,j+0)*temp2
254                t30=t30+s2(l,j+0)*temp3
255                temp (temppos+0) = temp0
256                temp (temppos+1) = temp1
257                temp (temppos+2) = temp2
258                temp (temppos+3) = temp3
259                temppos = temppos + 4
260               end do
261               dest(i+0,j+0)=t00
262               dest(i+1,j+0)=t10
263               dest(i+2,j+0)=t20
264               dest(i+3,j+0)=t30
265              end do
266             end do
267             do j=kk+2,min(k_extnt,jj+bs-1)
268              temppos = 1
269              do i=ii,min(nn,ii+bs-1),4
270               if (flag .eq. 0) then
271                t00=zero
272                t10=zero
273                t20=zero
274                t30=zero
275               else
276                t00=dest(i+0,j+0)
277                t10=dest(i+1,j+0)
278                t20=dest(i+2,j+0)
279                t30=dest(i+3,j+0)
280               end if
281               do l=ll,min(m_extnt,ll+bs-1)
282                temp0 = temp (temppos+0)
283                temp1 = temp (temppos+1)
284                temp2 = temp (temppos+2)
285                temp3 = temp (temppos+3)
286                t00=t00+s2(l,j+0)*temp0
287                t10=t10+s2(l,j+0)*temp1
288                t20=t20+s2(l,j+0)*temp2
289                t30=t30+s2(l,j+0)*temp3
290                temppos = temppos + 4
291               end do
292               dest(i+0,j+0)=t00
293               dest(i+1,j+0)=t10
294               dest(i+2,j+0)=t20
295               dest(i+3,j+0)=t30
296              end do
297             end do
298            end if
299            if (mmod4 .ne. 0) then
300             temppos = 1
301             if (jj .le. kk) then
302             j = jj
303             do i=nn+1,min(n_extnt,ii+bs-1)
304              if (flag .eq. 0) then
305               t00=zero
306               t01=zero
307               t02=zero
308               t03=zero
309              else
310               t00=dest(i+0,j+0)
311               t01=dest(i+0,j+1)
312               t02=dest(i+0,j+2)
313               t03=dest(i+0,j+3)
314              end if
315              do l=ll,min(m_extnt,ll+bs-1)
316               temp0 = s1(i+0,l)
317               t00=t00+s2(l,j+0)*temp0
318               t01=t01+s2(l,j+1)*temp0
319               t02=t02+s2(l,j+2)*temp0
320               t03=t03+s2(l,j+3)*temp0
321               temp (temppos) = temp0
322               temppos = temppos + 1
323              end do
324              dest(i+0,j+0)=t00
325              dest(i+0,j+1)=t01
326              dest(i+0,j+2)=t02
327              dest(i+0,j+3)=t03
328             end do
329             do j=jj+4,min(kk,jj+bs-1),4
330              temppos = 1
331              do i=nn+1,min(n_extnt,ii+bs-1)
332               if (flag .eq. 0) then
333                t00=zero
334                t01=zero
335                t02=zero
336                t03=zero
337               else
338                t00=dest(i+0,j+0)
339                t01=dest(i+0,j+1)
340                t02=dest(i+0,j+2)
341                t03=dest(i+0,j+3)
342               end if
343               do l=ll,min(m_extnt,ll+bs-1)
344                temp0 = temp (temppos)
345                t00=t00+s2(l,j+0)*temp0
346                t01=t01+s2(l,j+1)*temp0
347                t02=t02+s2(l,j+2)*temp0
348                t03=t03+s2(l,j+3)*temp0
349                temppos = temppos + 1
350               end do
351               dest(i+0,j+0)=t00
352               dest(i+0,j+1)=t01
353               dest(i+0,j+2)=t02
354               dest(i+0,j+3)=t03
355              end do
356             end do
357             end if
358            end if
359            if ((nmod4 .ne. 0) .and. (mmod4 .ne. 0)) then
360             temppos = 1
361             do j=kk+1,min(kk+1,jj+bs-1)
362              do i=nn+1,min(n_extnt,ii+bs-1)
363               if (flag .eq. 0) then
364                t00=zero
365               else
366                t00=dest(i,j)
367               end if
368               do l=ll,min(m_extnt,ll+bs-1)
369                temp0 = s1(i,l)
370                t00=t00+s2(l,j)*temp0
371                temp (temppos) = temp0
372                temppos = temppos + 1
373               end do
374               dest(i,j)=t00
375              end do
376             end do
377             do j=kk+2,min(k_extnt,jj+bs-1)
378              temppos = 1
379              do i=nn+1,min(n_extnt,ii+bs-1)
380               if (flag .eq. 0) then
381                t00=zero
382               else
383                t00=dest(i,j)
384               end if
385               do l=ll,min(m_extnt,ll+bs-1)
386                temp0 = temp (temppos)
387                t00=t00+s2(l,j)*temp0
388                temppos = temppos + 1
389               end do
390               dest(i,j)=t00
391              end do
392             end do
393            end if
394            end if
395            flag = 1
396           end do
397          end do
398         end do
399      else
400         do k = 1, k_extnt
401            do n = 1, n_extnt
402               dest(1+(n-1)*d_d1_lstride,k) = 0.0d0
403            enddo
404         enddo
405         do k = 1, k_extnt
406           do m = 1, m_extnt
407               do n = 1, n_extnt
408                  dest(1+(n-1)*d_d1_lstride,k) =                &
409                                 dest(1+(n-1)*d_d1_lstride,k) + &
410                                            s1(n,m) * s2(m,k)
411               enddo
412            enddo
413         enddo
414      endif
415      return
416      end
417
418subroutine F90_matmul_int1_str1_mxv(dest, s1,s2,  &
419                    n_extent,m_extent, ld1,dlstride)
420
421   implicit none
422   DESC_INT  n_extent,m_extent,ld1,ld2,dlstride
423   DESC_INT  mmod4, mmod2, m2
424   DESC_INT  jx,kx,jj,kk,kmod4,k4,incx
425   DESC_INT  j0,j1,j2,j3,iy,ky,m4
426   INTEGER*1 t0,t1,t2,t3
427   INTEGER*1 s1(ld1,m_extent)
428   INTEGER*1 s2(m_extent)
429   INTEGER*1 dest(ld1)
430
431   DESC_INT  i,j,k
432   INTEGER   bs
433   parameter (bs = 384)
434   INTEGER*1 temp (bs)
435   DESC_INT  ind(bs)
436   INTEGER*1 zero
437   parameter (zero = 0)
438
439   if (dlstride .eq. 1) then
440         do k = 1, n_extent
441            dest(k) = 0.0d0
442         end do
443         mmod4 = mod(n_extent, 4)
444         mmod2 = mod(n_extent, 2)
445         m4 = n_extent - mmod4
446         m2 = n_extent- mmod2
447         kx = 1
448         ky = 1
449         incx = 1
450         jx = kx
451         do jj = 1, m_extent, bs
452            jx = kx + (jj-1)
453            kk = 0
454            do j = jj, min (m_extent, jj+bs-1)
455               if (s2(jx) .ne. zero) then
456                  kk = kk + 1
457                  temp(kk) = s2(jx)
458                  ind(kk) = j
459               end if
460               jx = jx + 1
461            end do
462            kmod4 = mod(kk, 4)
463            k4 = kk - kmod4
464            do j = 1, k4, 4
465               t0 = temp(j)
466               t1 = temp(j+1)
467               t2 = temp(j+2)
468               t3 = temp(j+3)
469               j0 = ind(j)
470               j1 = ind(j+1)
471               j2 = ind(j+2)
472               j3 = ind(j+3)
473               iy = ky
474               do i = 1, m4, 4
475                  dest( iy ) = dest( iy )+t0*s1(i, j0) &
476                           + t1*s1(i, j1) &
477                           + t2*s1(i, j2) &
478                           + t3*s1(i, j3)
479
480                  dest(iy+1) = dest(iy+1)+t0*s1(i+1, j0) &
481                               + t1*s1(i+1, j1) &
482                               + t2*s1(i+1, j2) &
483                               + t3*s1(i+1, j3)
484
485                  dest(iy+2) = dest(iy+2)+t0*s1(i+2, j0) &
486                               + t1*s1(i+2, j1) &
487                               + t2*s1(i+2, j2) &
488                               + t3*s1(i+2, j3)
489
490                  dest(iy+3) = dest(iy+3 )+t0*s1(i+3, j0) &
491                               + t1*s1(i+3, j1) &
492                               + t2*s1(i+3, j2) &
493                               + t3*s1(i+3, j3)
494                  iy = iy + 4
495               end do
496               iy = ky + m4
497               do i = m4 + 1, n_extent
498                  dest(iy) = dest(iy) + t0*s1(i, j0) &
499                           + t1*s1(i, j1) &
500                           + t2*s1(i, j2) &
501                           + t3*s1(i, j3)
502                  iy = iy + 1
503               end do
504            end do
505            do j = k4+1, kk
506               t0 = temp(j)
507               j0 = ind(j)
508               iy = ky
509               do i = 1, m4, 4
510                  dest( iy ) = dest(iy) +t0*s1(i,j0)
511                  dest( iy+1) = dest(iy+1) +t0*s1(i+1,j0)
512                  dest( iy+2) = dest(iy+2)+t0*s1(i+2,j0)
513                  dest( iy+3) = dest(iy+3)+t0*s1(i+3,j0)
514                  iy = iy + 4
515              end do
516           end do
517           do j = k4+1, kk
518              t0 = temp(j)
519              j0 = ind(j)
520              iy = ky + m4
521              do i = m4 + 1, n_extent
522                dest(iy) = dest(iy) + t0 * s1(i, j0)
523                iy = iy + 1
524              end do
525           end do
526
527         end do
528   else
529         do k = 1, n_extent
530            dest(1+(k-1)*dlstride) = 0.0d0
531         enddo
532         mmod4 = mod(n_extent, 4)
533         mmod2 = mod(n_extent, 2)
534         m4 = n_extent - mmod4
535         m2 = n_extent- mmod2
536         kx = 1
537         ky = 1
538         incx = 1
539         jx = kx
540         do jj = 1, m_extent, bs
541            jx = kx + (jj-1)
542            kk = 0
543            do j = jj, min (m_extent, jj+bs-1)
544               if (s2(jx) .ne. zero) then
545                  kk = kk + 1
546                  temp(kk) = s2(jx)
547                  ind(kk) = j
548               end if
549               jx = jx + 1
550            end do
551            kmod4 = mod(kk, 4)
552            k4 = kk - kmod4
553            do j = 1, k4, 4
554               t0 = temp(j)
555               t1 = temp(j+1)
556               t2 = temp(j+2)
557               t3 = temp(j+3)
558               j0 = ind(j)
559               j1 = ind(j+1)
560               j2 = ind(j+2)
561               j3 = ind(j+3)
562               iy = ky
563               do i = 1, m4, 4
564                  dest(1+(iy-1)*dlstride) = dest(1+(iy-1)*dlstride) + t0*s1(i, j0) &
565                           + t1*s1(i, j1) &
566                           + t2*s1(i, j2) &
567                           + t3*s1(i, j3)
568
569                  dest(1+(iy + 1 -1)*dlstride) = dest(1+(iy + 1 -1)*dlstride) + t0*s1(i+1, j0) &
570                               + t1*s1(i+1, j1) &
571                               + t2*s1(i+1, j2) &
572                               + t3*s1(i+1, j3)
573
574                  dest(1+(iy + 2 -1)*dlstride) = dest(1+(iy + 2 -1)*dlstride) + t0*s1(i+2, j0) &
575                               + t1*s1(i+2, j1) &
576                               + t2*s1(i+2, j2) &
577                               + t3*s1(i+2, j3)
578
579                  dest(1+(iy + 3 -1)*dlstride) = dest(1+(iy + 3 -1)*dlstride) + t0*s1(i+3, j0) &
580                               + t1*s1(i+3, j1) &
581                               + t2*s1(i+3, j2) &
582                               + t3*s1(i+3, j3)
583                  iy = iy + 4
584               end do
585               iy = ky + m4
586               do i = m4 + 1, n_extent
587                  dest(1+(iy-1)*dlstride) = dest(1+(iy-1)*dlstride) + t0*s1(i, j0) &
588                           + t1*s1(i, j1) &
589                           + t2*s1(i, j2) &
590                           + t3*s1(i, j3)
591                  iy = iy + 1
592               end do
593            end do
594            do j = k4+1, kk
595               t0 = temp(j)
596               j0 = ind(j)
597               iy = ky
598               do i = 1, m4, 4
599                  dest(1+(iy-1)*dlstride) = dest(1+(iy-1)*dlstride) + t0*s1(i,j0)
600                  dest(1+(iy + 1 -1)*dlstride) = dest(1+(iy + 1 -1)*dlstride) + t0*s1(i+1,j0)
601                  dest(1+(iy + 2 -1)*dlstride) = dest(1+(iy + 2 -1)*dlstride) + t0*s1(i+2,j0)
602                  dest(1+(iy + 3 -1)*dlstride) = dest(1+(iy + 3 -1)*dlstride) + t0*s1(i+3,j0)
603                  iy = iy + 4
604              end do
605           end do
606           do j = k4+1, kk
607              t0 = temp(j)
608              j0 = ind(j)
609              iy = ky + m4
610              do i = m4 + 1, n_extent
611                dest(1+(iy-1)*dlstride) = dest(1+(iy-1)*dlstride) + t0 * s1(i, j0)
612                iy = iy + 1
613              end do
614           end do
615         end do
616   endif
617end
618
619subroutine F90_matmul_int1_str1_vxm(dest, s1,s2,  &
620                   k_extent,m_extent, ld1,dlstride)
621
622   implicit none
623   DESC_INT  k_extent,m_extent,ld1,ld2,dlstride
624   INTEGER*1 s1(m_extent)
625   INTEGER*1 s2(ld1,k_extent)
626   INTEGER*1 dest(k_extent)
627
628   INTEGER   bs
629   parameter (bs = 384)
630   INTEGER*1 temp (bs)
631   INTEGER*1 t0,t1,t2,t3
632   INTEGER*1 t4,t5,t6,t7
633   INTEGER*1 dtemp0, dtemp1, dtemp2, dtemp3
634   INTEGER*1 dtemp4, dtemp5, dtemp6, dtemp7
635   DESC_INT  ind(bs),ind0,ind1,ind2,ind3
636   DESC_INT  ind4,ind5,ind6,ind7
637   INTEGER*1 zero
638   parameter (zero = 0)
639
640   DESC_INT mi,ki,ti
641   DESC_INT ii,jj,j,jx,kk,jnext
642   DESC_INT mmod8,m8,kmod8,k8,tmod8,tt8
643   DESC_INT mmod4,m4,kmod4,k4,tmod4,tt4
644
645   if (dlstride .eq. 1) then
646      do ki = 1, k_extent
647         dest(ki) = 0
648       end do
649
650      mmod8 = mod(m_extent,8)
651      m8 = m_extent - mmod8
652      kmod8 = mod(k_extent,8)
653      k8 = k_extent - kmod8
654
655      do kk = 1,k8,8
656         dtemp0 = dest(kk)
657         dtemp1 = dest(kk+1)
658         dtemp2 = dest(kk+2)
659         dtemp3 = dest(kk+3)
660         dtemp4 = dest(kk+4)
661         dtemp5 = dest(kk+5)
662         dtemp6 = dest(kk+6)
663         dtemp7 = dest(kk+7)
664         jnext = 1
665         do jj = 1,m8,bs
666            ! load s1 temp vector
667            ti = 0
668            jx = jj
669            do j = jj, min (m_extent, jj+bs-1)
670               if (s1(jx) .ne. zero) then
671                  ti = ti + 1
672                  temp(ti) = s1(jx)
673                  ind(ti) = j
674               end if
675               jx = jx + 1
676            end do
677
678            tmod8 = mod(ti,8)
679            tt8 = ti - tmod8
680
681            if (tt8 .ne. 0) then
682               jnext = ind(tt8)+1
683            endif
684
685            ti = 1
686            do ii = 1,tt8,8
687               t0 =  temp(ti)
688               t1 =  temp(ti+1)
689               t2 =  temp(ti+2)
690               t3 =  temp(ti+3)
691               t4 =  temp(ti+4)
692               t5 =  temp(ti+5)
693               t6 =  temp(ti+6)
694               t7 =  temp(ti+7)
695               ind0 = ind(ti)
696               ind1 = ind(ti+1)
697               ind2 = ind(ti+2)
698               ind3 = ind(ti+3)
699               ind4 = ind(ti+4)
700               ind5 = ind(ti+5)
701               ind6 = ind(ti+6)
702               ind7 = ind(ti+7)
703
704               dtemp0 = dtemp0 + &
705                          t0 * s2(ind0, kk) + t1 * s2(ind1, kk) + &
706                          t2 * s2(ind2, kk) + t3 * s2(ind3, kk) + &
707                          t4 * s2(ind4, kk) + t5 * s2(ind5, kk) + &
708                          t6 * s2(ind6, kk) + t7 * s2(ind7, kk)
709               dtemp1 = dtemp1 + &
710                          t0 * s2(ind0, kk+1) + t1 * s2(ind1, kk+1) + &
711                          t2 * s2(ind2, kk+1) + t3 * s2(ind3, kk+1) + &
712                          t4 * s2(ind4, kk+1) + t5 * s2(ind5, kk+1) + &
713                          t6 * s2(ind6, kk+1) + t7 * s2(ind7, kk+1)
714               dtemp2 = dtemp2 +  &
715                          t0 * s2(ind0, kk+2) + t1 * s2(ind1, kk+2) + &
716                          t2 * s2(ind2, kk+2) + t3 * s2(ind3, kk+2) + &
717                          t4 * s2(ind4, kk+2) + t5 * s2(ind5, kk+2) + &
718                          t6 * s2(ind6, kk+2) + t7 * s2(ind7, kk+2)
719               dtemp3 = dtemp3 +  &
720                          t0 * s2(ind0, kk+3) + t1 * s2(ind1, kk+3) + &
721                          t2 * s2(ind2, kk+3) + t3 * s2(ind3, kk+3) + &
722                          t4 * s2(ind4, kk+3) + t5 * s2(ind5, kk+3) + &
723                          t6 * s2(ind6, kk+3) + t7 * s2(ind7, kk+3)
724
725               dtemp4 = dtemp4 + &
726                          t0 * s2(ind0, kk+4) + t1 * s2(ind1, kk+4) + &
727                          t2 * s2(ind2, kk+4) + t3 * s2(ind3, kk+4) + &
728                          t4 * s2(ind4, kk+4) + t5 * s2(ind5, kk+4) + &
729                          t6 * s2(ind6, kk+4) + t7 * s2(ind7, kk+4)
730               dtemp5 = dtemp5 + &
731                          t0 * s2(ind0, kk+5) + t1 * s2(ind1, kk+5) + &
732                          t2 * s2(ind2, kk+5) + t3 * s2(ind3, kk+5) + &
733                          t4 * s2(ind4, kk+5) + t5 * s2(ind5, kk+5) + &
734                          t6 * s2(ind6, kk+5) + t7 * s2(ind7, kk+5)
735               dtemp6 = dtemp6 +  &
736                          t0 * s2(ind0, kk+6) + t1 * s2(ind1, kk+6) + &
737                          t2 * s2(ind2, kk+6) + t3 * s2(ind3, kk+6) + &
738                          t4 * s2(ind4, kk+6) + t5 * s2(ind5, kk+6) + &
739                          t6 * s2(ind6, kk+6) + t7 * s2(ind7, kk+6)
740               dtemp7 = dtemp7 +  &
741                          t0 * s2(ind0, kk+7) + t1 * s2(ind1, kk+7) + &
742                          t2 * s2(ind2, kk+7) + t3 * s2(ind3, kk+7) + &
743                          t4 * s2(ind4, kk+7) + t5 * s2(ind5, kk+7) + &
744                          t6 * s2(ind6, kk+7) + t7 * s2(ind7, kk+7)
745               ti = ti + 8
746            enddo
747         enddo
748         do ii = jnext,m_extent
749            t0 = s1(ii)
750            dtemp0 = dtemp0 + t0 * s2(ii,kk)
751            dtemp1 = dtemp1 + t0 * s2(ii,kk+1)
752            dtemp2 = dtemp2 + t0 * s2(ii,kk+2)
753            dtemp3 = dtemp3 + t0 * s2(ii,kk+3)
754            dtemp4 = dtemp4 + t0 * s2(ii,kk+4)
755            dtemp5 = dtemp5 + t0 * s2(ii,kk+5)
756            dtemp6 = dtemp6 + t0 * s2(ii,kk+6)
757            dtemp7 = dtemp7 + t0 * s2(ii,kk+7)
758         enddo
759         dest(kk) = dtemp0
760         dest(kk+1) = dtemp1
761         dest(kk+2) = dtemp2
762         dest(kk+3) = dtemp3
763         dest(kk+4) = dtemp4
764         dest(kk+5) = dtemp5
765         dest(kk+6) = dtemp6
766         dest(kk+7) = dtemp7
767      enddo
768      do kk = k8+1,k_extent
769         dtemp0 = dest(kk)
770         do ii = 1,m_extent
771            dtemp0 = dtemp0 + s1(ii) *  s2(ii,kk)
772          enddo
773         dest(kk) = dtemp0
774      enddo
775
776
777   else
778      do kk = 1, k_extent
779         dest(1+(kk-1)*dlstride) = 0
780      end do
781
782      mmod4 = mod(m_extent,4)
783      m4 = m_extent - mmod4
784      kmod4 = mod(k_extent,4)
785      k4 = k_extent - kmod4
786
787      do kk = 1,k4,4
788         dtemp0 = dest(1+(kk-1)*dlstride)
789         dtemp1 = dest(1+(kk)*dlstride)
790         dtemp2 = dest(1+(kk+1)*dlstride)
791         dtemp3 = dest(1+(kk+2)*dlstride)
792         jnext = 1
793         do jj = 1,m4,bs
794            ! load s1 temp vector
795            ti = 0
796            jx = jj
797            do j = jj, min (m_extent, jj+bs-1)
798               if (s1(jx) .ne. zero) then
799                  ti = ti + 1
800                  temp(ti) = s1(jx)
801                  ind(ti) = j
802               end if
803               jx = jx + 1
804            end do
805
806            tmod4 = mod(ti,4)
807            tt4 = ti - tmod4
808
809            if (tt4 .ne. 0) then
810               jnext = ind(tt4)+1
811            endif
812
813            ti = 1
814            do ii = 1,tt4,4
815               t0 =  temp(ti)
816               t1 =  temp(ti+1)
817               t2 =  temp(ti+2)
818               t3 =  temp(ti+3)
819               ind0 = ind(ti)
820               ind1 = ind(ti+1)
821               ind2 = ind(ti+2)
822               ind3 = ind(ti+3)
823
824               dtemp0 = dtemp0 + &
825                          t0 * s2(ind0, kk) + t1 * s2(ind1, kk) + &
826                          t2 * s2(ind2, kk) + t3 * s2(ind3, kk)
827               dtemp1 = dtemp1 + &
828                            t0 * s2(ind0, kk+1) + t1 * s2(ind1, kk+1) + &
829                            t2 * s2(ind2, kk+1) + t3 * s2(ind3, kk+1)
830               dtemp2 = dtemp2 +  &
831                            t0 * s2(ind0, kk+2) + t1 * s2(ind1, kk+2) + &
832                            t2 * s2(ind2, kk+2) + t3 * s2(ind3, kk+2)
833               dtemp3 = dtemp3 +  &
834                            t0 * s2(ind0, kk+3) + t1 * s2(ind1, kk+3) + &
835                            t2 * s2(ind2, kk+3) + t3 * s2(ind3, kk+3)
836
837               ti = ti + 4
838            enddo
839         enddo
840         do ii = jnext,m_extent
841            t0 = s1(ii)
842            dtemp0 = dtemp0 +  t0 * s2(ii,kk)
843            dtemp1 = dtemp1 + t0 * s2(ii,kk+1)
844            dtemp2 = dtemp2 + t0 * s2(ii,kk+2)
845            dtemp3 = dtemp3 +  t0 * s2(ii,kk+3)
846         enddo
847         dest(1+(kk-1)*dlstride) = dtemp0
848         dest(1+(kk)*dlstride) = dtemp1
849         dest(1+(kk+1)*dlstride) = dtemp2
850         dest(1+(kk+2)*dlstride) = dtemp3
851      enddo
852      do kk = k4+1,k_extent
853         dtemp0 = dest(1+(kk-1)*dlstride)
854         do ii = 1,m_extent
855            dtemp0 = dtemp0 + s1(ii) *  s2(ii,kk)
856          enddo
857         dest(1+(kk-1)*dlstride) = dtemp0
858      enddo
859   endif
860end
861
862