1! { dg-options "-O2" }
2! { dg-skip-if "AArch64 tiny code model does not support programs larger than 1MiB" {aarch64_tiny} }
3
4program strassen_matmul
5  use omp_lib
6  integer, parameter :: N = 1024
7  double precision, save :: A(N,N), B(N,N), C(N,N), D(N,N)
8  double precision :: start, end
9
10  call random_seed
11  call random_number (A)
12  call random_number (B)
13  start = omp_get_wtime ()
14  C = matmul (A, B)
15  end = omp_get_wtime ()
16  write(*,'(a, f10.6)') ' Time for matmul      = ', end - start
17  D = 0
18  start = omp_get_wtime ()
19  call strassen (A, B, D, N)
20  end = omp_get_wtime ()
21  write(*,'(a, f10.6)') ' Time for Strassen    = ', end - start
22  if (sqrt (sum ((C - D) ** 2)) / N .gt. 0.1) stop 1
23  D = 0
24  start = omp_get_wtime ()
25!$omp parallel
26!$omp single
27  call strassen (A, B, D, N)
28!$omp end single nowait
29!$omp end parallel
30  end = omp_get_wtime ()
31  write(*,'(a, f10.6)') ' Time for Strassen MP = ', end - start
32  if (sqrt (sum ((C - D) ** 2)) / N .gt. 0.1) stop 2
33
34contains
35
36  recursive subroutine strassen (A, B, C, N)
37    integer, intent(in) :: N
38    double precision, intent(in) :: A(N,N), B(N,N)
39    double precision, intent(out) :: C(N,N)
40    double precision :: T(N/2,N/2,7)
41    integer :: K, L
42
43    if (iand (N,1) .ne. 0 .or. N < 64) then
44      C = matmul (A, B)
45      return
46    end if
47    K = N / 2
48    L = N / 2 + 1
49!$omp task shared (A, B, T)
50    call strassen (A(:K,:K) + A(L:,L:), B(:K,:K) + B(L:,L:), T(:,:,1), K)
51!$omp end task
52!$omp task shared (A, B, T)
53    call strassen (A(L:,:K) + A(L:,L:), B(:K,:K), T(:,:,2), K)
54!$omp end task
55!$omp task shared (A, B, T)
56    call strassen (A(:K,:K), B(:K,L:) - B(L:,L:), T(:,:,3), K)
57!$omp end task
58!$omp task shared (A, B, T)
59    call strassen (A(L:,L:), B(L:,:K) - B(:K,:K), T(:,:,4), K)
60!$omp end task
61!$omp task shared (A, B, T)
62    call strassen (A(:K,:K) + A(:K,L:), B(L:,L:), T(:,:,5), K)
63!$omp end task
64!$omp task shared (A, B, T)
65    call strassen (A(L:,:K) - A(:K,:K), B(:K,:K) + B(:K,L:), T(:,:,6), K)
66!$omp end task
67!$omp task shared (A, B, T)
68    call strassen (A(:K,L:) - A(L:,L:), B(L:,:K) + B(L:,L:), T(:,:,7), K)
69!$omp end task
70!$omp taskwait
71    C(:K,:K) = T(:,:,1) + T(:,:,4) - T(:,:,5) + T(:,:,7)
72    C(L:,:K) = T(:,:,2) + T(:,:,4)
73    C(:K,L:) = T(:,:,3) + T(:,:,5)
74    C(L:,L:) = T(:,:,1) - T(:,:,2) + T(:,:,3) + T(:,:,6)
75  end subroutine strassen
76end
77