1; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
2; RUN: -debug < %s 2>&1| FileCheck %s
3; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
4; RUN: -polly-target-throughput-vector-fma=1 \
5; RUN: -polly-target-latency-vector-fma=8 \
6; RUN: -polly-target-1st-cache-level-size=32768 \
7; RUN: -polly-target-vector-register-bitwidth=256 \
8; RUN: -polly-target-2nd-cache-level-size=262144 -polly-ast \
9; RUN: -analyze < %s | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
10; REQUIRES: asserts
11;
12;    C := A * B + C
13;    Check that the pattern matching optimizations can detect different
14;    permutations of the GEMM loops and produce the correct ISL AST. In this case,
15;    dimensions of band nodes can be implicitly permuted by the algorithm
16;    applied during the schedule generation. This should be taken into
17;    account during the pattern matching optimizations.
18;    for (i = 0; i < _PB_NI; i++)
19;      for (k = 0; k < _PB_NK; ++k)
20;        for (j = 0; j < _PB_NJ; j++)
21;          C[i][j] += A[i][k] * B[k][j];
22;
23; CHECK: The matrix multiplication pattern was detected
24;
25; PATTERN-MATCHING-OPTS:    // 1st level tiling - Tiles
26; PATTERN-MATCHING-OPTS-NEXT:    for (int c1 = 0; c1 <= 3; c1 += 1) {
27; PATTERN-MATCHING-OPTS-NEXT:      for (int c3 = 256 * c1; c3 <= 256 * c1 + 255; c3 += 1)
28; PATTERN-MATCHING-OPTS-NEXT:        for (int c4 = 0; c4 <= 1023; c4 += 1)
29; PATTERN-MATCHING-OPTS-NEXT:          CopyStmt_0(0, c3, c4);
30; PATTERN-MATCHING-OPTS-NEXT:      for (int c2 = 0; c2 <= 10; c2 += 1) {
31; PATTERN-MATCHING-OPTS-NEXT:        for (int c3 = 96 * c2; c3 <= min(1023, 96 * c2 + 95); c3 += 1)
32; PATTERN-MATCHING-OPTS-NEXT:          for (int c4 = 256 * c1; c4 <= 256 * c1 + 255; c4 += 1)
33; PATTERN-MATCHING-OPTS-NEXT:            CopyStmt_1(c3, c4, 0);
34; PATTERN-MATCHING-OPTS-NEXT:        // 1st level tiling - Points
35; PATTERN-MATCHING-OPTS-NEXT:        // Register tiling - Tiles
36; PATTERN-MATCHING-OPTS-NEXT:        for (int c3 = 0; c3 <= 127; c3 += 1)
37; PATTERN-MATCHING-OPTS-NEXT:          for (int c4 = 0; c4 <= min(23, -24 * c2 + 255); c4 += 1)
38; PATTERN-MATCHING-OPTS-NEXT:            for (int c5 = 0; c5 <= 255; c5 += 1) {
39; PATTERN-MATCHING-OPTS-NEXT:              // Loop Vectorizer Disabled
40; PATTERN-MATCHING-OPTS-NEXT:              // Register tiling - Points
41; PATTERN-MATCHING-OPTS-NEXT:              {
42; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3);
43; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3 + 1);
44; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3 + 2);
45; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3 + 3);
46; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3 + 4);
47; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3 + 5);
48; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3 + 6);
49; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4, 256 * c1 + c5, 8 * c3 + 7);
50; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3);
51; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3 + 1);
52; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3 + 2);
53; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3 + 3);
54; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3 + 4);
55; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3 + 5);
56; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3 + 6);
57; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 1, 256 * c1 + c5, 8 * c3 + 7);
58; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3);
59; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3 + 1);
60; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3 + 2);
61; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3 + 3);
62; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3 + 4);
63; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3 + 5);
64; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3 + 6);
65; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 2, 256 * c1 + c5, 8 * c3 + 7);
66; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3);
67; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3 + 1);
68; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3 + 2);
69; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3 + 3);
70; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3 + 4);
71; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3 + 5);
72; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3 + 6);
73; PATTERN-MATCHING-OPTS-NEXT:                Stmt_for_body6(96 * c2 + 4 * c4 + 3, 256 * c1 + c5, 8 * c3 + 7);
74; PATTERN-MATCHING-OPTS-NEXT:              }
75; PATTERN-MATCHING-OPTS-NEXT:            }
76; PATTERN-MATCHING-OPTS-NEXT:      }
77; PATTERN-MATCHING-OPTS-NEXT:    }
78;
79target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
80target triple = "x86_64-unknown-unknown"
81
82define internal void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, [1024 x double]* %C, [1024 x double]* %A, [1024 x double]* %B) {
83entry:
84  br label %entry.split
85
86entry.split:                                      ; preds = %entry
87  br label %for.cond1.preheader
88
89for.cond1.preheader:                              ; preds = %for.inc20, %entry.split
90  %indvars.iv41 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next42, %for.inc20 ]
91  br label %for.cond4.preheader
92
93for.cond4.preheader:                              ; preds = %for.inc17, %for.cond1.preheader
94  %indvars.iv38 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next39, %for.inc17 ]
95  br label %for.body6
96
97for.body6:                                        ; preds = %for.body6, %for.cond4.preheader
98  %indvars.iv = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next, %for.body6 ]
99  %arrayidx8 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i64 %indvars.iv41, i64 %indvars.iv38
100  %tmp = load double, double* %arrayidx8, align 8
101  %arrayidx12 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv38, i64 %indvars.iv
102  %tmp1 = load double, double* %arrayidx12, align 8
103  %mul = fmul double %tmp, %tmp1
104  %arrayidx16 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv41, i64 %indvars.iv
105  %tmp2 = load double, double* %arrayidx16, align 8
106  %add = fadd double %tmp2, %mul
107  store double %add, double* %arrayidx16, align 8
108  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
109  %exitcond = icmp ne i64 %indvars.iv.next, 1024
110  br i1 %exitcond, label %for.body6, label %for.inc17
111
112for.inc17:                                        ; preds = %for.body6
113  %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
114  %exitcond40 = icmp ne i64 %indvars.iv.next39, 1024
115  br i1 %exitcond40, label %for.cond4.preheader, label %for.inc20
116
117for.inc20:                                        ; preds = %for.inc17
118  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
119  %exitcond43 = icmp ne i64 %indvars.iv.next42, 1024
120  br i1 %exitcond43, label %for.cond1.preheader, label %for.end22
121
122for.end22:                                        ; preds = %for.inc20
123  ret void
124}
125