1// RUN: mlir-opt %s -linalg-fuse-elementwise-ops -split-input-file | FileCheck %s
2
// Test case: an elementwise addf generic feeds an elementwise mulf generic,
// with every operand of both ops indexed through the identity map #map0.
// The FileCheck directives expect a single linalg.generic carrying four
// identity indexing maps, whose block computes the addf of its first two
// arguments and then a mulf of that sum with the third argument, with no
// linalg.yield between the two arithmetic ops.
3// CHECK-DAG: [[$MAP0:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0, d1)>
4#map0 = affine_map<(d0, d1) -> (d0, d1)>
5
6// CHECK-LABEL: @add_mul_fusion
7func @add_mul_fusion(%arg0: tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
8{
9  %c0 = constant 0 : index
10  %c1 = constant 1 : index
11  %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
12  %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
13  %2 = linalg.init_tensor [%0, %1] : tensor<?x?xf32>
  // Producer: %3 = %arg0 + %arg1, elementwise.
14  %3 = linalg.generic {indexing_maps = [#map0, #map0, #map0], iterator_types = ["parallel", "parallel"]}
15      ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
16      outs(%2 : tensor<?x?xf32>) {
17    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):       // no predecessors
18      %4 = addf %arg3, %arg4 : f32
19      linalg.yield %4 : f32
20  } -> tensor<?x?xf32>
21  // CHECK: linalg.generic {
22  // CHECK-SAME: indexing_maps = {{\[}}[[$MAP0]], [[$MAP0]], [[$MAP0]], [[$MAP0]]{{\]}}
  // Consumer: %4 = %3 * %arg2, elementwise; reads the producer result as its
  // first input.
23  %4 = linalg.generic {indexing_maps = [#map0, #map0, #map0], iterator_types = ["parallel", "parallel"]}
24      ins(%3, %arg2 : tensor<?x?xf32>, tensor<?x?xf32>)
25      outs(%2 : tensor<?x?xf32>) {
26    // CHECK: ^{{[a-zA-Z0-9_]*}}
27    // CHECK-SAME: [[ARG0:%[a-zA-Z0-9_]*]]
28    // CHECK-SAME: [[ARG1:%[a-zA-Z0-9_]*]]
29    // CHECK-SAME: [[ARG2:%[a-zA-Z0-9_]*]]
30    ^bb0(%arg5: f32, %arg6: f32, %arg7: f32):       // no predecessors
31      // CHECK: [[T1:%[a-zA-Z0-9_]*]] = addf [[ARG0]], [[ARG1]]
32      // CHECK-NOT: linalg.yield
33      // CHECK: mulf [[T1]], [[ARG2]]
34      // CHECK: linalg.yield
35      %5 = mulf %arg5, %arg6 : f32
36      linalg.yield %5 : f32
37    } -> tensor<?x?xf32>
38  return %4 : tensor<?x?xf32>
39}
40
41// -----
42
// Test case: same addf -> mulf chain as above, but the second input of each
// generic is a scalar f32 function argument indexed through #map1, which maps
// the 2-D iteration space to (). The directives expect one linalg.generic with
// maps [identity, scalar, scalar, identity] and a block that does the addf and
// then the mulf before the only linalg.yield.
43// CHECK-DAG: [[$MAP0:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0, d1)>
44// CHECK-DAG: [[$MAP1:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> ()>
45#map0 = affine_map<(d0, d1) -> (d0, d1)>
46#map1 = affine_map<(d0, d1) -> ()>
47
48// CHECK-LABEL: @scalar_add_mul_fusion
49func @scalar_add_mul_fusion(%arg0: tensor<?x?xf32>, %arg1 : f32, %arg2 : f32) -> tensor<?x?xf32>
50{
51  %c0 = constant 0 : index
52  %c1 = constant 1 : index
53  %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
54  %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
55  %2 = linalg.init_tensor [%0, %1] : tensor<?x?xf32>
  // Producer: adds the scalar %arg1 to every element of %arg0.
56  %3 = linalg.generic {indexing_maps = [#map0, #map1, #map0], iterator_types = ["parallel", "parallel"]}
57      ins(%arg0, %arg1 : tensor<?x?xf32>, f32)
58      outs(%2 : tensor<?x?xf32>) {
59    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):       // no predecessors
60      %4 = addf %arg3, %arg4 : f32
61      linalg.yield %4 : f32
62  } -> tensor<?x?xf32>
63  // CHECK: linalg.generic {
64  // CHECK-SAME: indexing_maps = {{\[}}[[$MAP0]], [[$MAP1]], [[$MAP1]], [[$MAP0]]{{\]}}
  // Consumer: multiplies the producer result by the scalar %arg2.
65  %4 = linalg.generic {indexing_maps = [#map0, #map1, #map0], iterator_types = ["parallel", "parallel"]}
66      ins(%3, %arg2 : tensor<?x?xf32>, f32)
67      outs(%2 : tensor<?x?xf32>) {
68    // CHECK: ^{{[a-zA-Z0-9_]*}}
69    // CHECK-SAME: [[ARG3:%[a-zA-Z0-9_]*]]
70    // CHECK-SAME: [[ARG4:%[a-zA-Z0-9_]*]]
71    // CHECK-SAME: [[ARG5:%[a-zA-Z0-9_]*]]
72    ^bb0(%arg5: f32, %arg6: f32, %arg7: f32):       // no predecessors
73      // CHECK: [[T1:%[a-zA-Z0-9_]*]] = addf [[ARG3]], [[ARG4]]
74      // CHECK-NOT: linalg.yield
75      // CHECK: mulf [[T1]], [[ARG5]]
76      // CHECK: linalg.yield
77      %5 = mulf %arg5, %arg6 : f32
78      linalg.yield %5 : f32
79    } -> tensor<?x?xf32>
80  return %4 : tensor<?x?xf32>
81}
82
83// -----
84
// Test case: the producer reads its second input through the transposing map
// #map1 ((d0, d1) -> (d1, d0)); the consumer uses identity maps only. The
// directives expect one linalg.generic whose indexing maps are
// [identity, transpose, identity, identity].
85// CHECK-DAG: [[$MAP0:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0, d1)>
86// CHECK-DAG: [[$MAP1:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d1, d0)>
87#map0 = affine_map<(d0, d1) -> (d0, d1)>
88#map1 = affine_map<(d0, d1) -> (d1, d0)>
89
90// CHECK-LABEL: @transpose_add_mul_fusion
91func @transpose_add_mul_fusion(%arg0: tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
92{
93  %c0 = constant 0 : index
94  %c1 = constant 1 : index
95  %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
96  %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
97  %2 = linalg.init_tensor [%0, %1] : tensor<?x?xf32>
  // Producer: %arg1 is accessed transposed (#map1).
98  %3 = linalg.generic {indexing_maps = [#map0, #map1, #map0], iterator_types = ["parallel", "parallel"]}
99      ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
100      outs(%2 : tensor<?x?xf32>) {
101    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):       // no predecessors
102      %4 = addf %arg3, %arg4 : f32
103      linalg.yield %4 : f32
104  } -> tensor<?x?xf32>
105  // CHECK: linalg.generic {
106  // CHECK-SAME: indexing_maps = {{\[}}[[$MAP0]], [[$MAP1]], [[$MAP0]], [[$MAP0]]{{\]}}
  // Consumer: identity access to the producer result and to %arg2.
107  %4 = linalg.generic {indexing_maps = [#map0, #map0, #map0], iterator_types = ["parallel", "parallel"]}
108      ins(%3, %arg2 : tensor<?x?xf32>, tensor<?x?xf32>)
109      outs(%2 : tensor<?x?xf32>) {
110    ^bb0(%arg5: f32, %arg6: f32, %arg7: f32):       // no predecessors
111      %5 = mulf %arg5, %arg6 : f32
112      linalg.yield %5 : f32
113    } -> tensor<?x?xf32>
114  return %4 : tensor<?x?xf32>
115}
116
117// -----
118
// Test case: the producer reads its second input through the transposing map
// #map1, and the consumer additionally reads the producer result through
// #map1. The directives expect one linalg.generic whose indexing maps are
// [transpose, identity, identity, identity].
119// CHECK-DAG: [[$MAP0:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0, d1)>
120// CHECK-DAG: [[$MAP1:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d1, d0)>
121#map0 = affine_map<(d0, d1) -> (d0, d1)>
122#map1 = affine_map<(d0, d1) -> (d1, d0)>
123
124// CHECK-LABEL: @add_transpose_mul_fusion
125func @add_transpose_mul_fusion(%arg0: tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
126{
127  %c0 = constant 0 : index
128  %c1 = constant 1 : index
129  %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
130  %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
131  %2 = linalg.init_tensor [%0, %1] : tensor<?x?xf32>
  // Producer: %arg1 is accessed transposed (#map1).
132  %3 = linalg.generic {indexing_maps = [#map0, #map1, #map0], iterator_types = ["parallel", "parallel"]}
133      ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
134      outs(%2 : tensor<?x?xf32>) {
135    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):       // no predecessors
136      %4 = addf %arg3, %arg4 : f32
137      linalg.yield %4 : f32
138  } -> tensor<?x?xf32>
139  // CHECK: linalg.generic {
140  // CHECK-SAME: indexing_maps = {{\[}}[[$MAP1]], [[$MAP0]], [[$MAP0]], [[$MAP0]]{{\]}}
  // Consumer: the producer result %3 is accessed transposed (#map1).
141  %4 = linalg.generic {indexing_maps = [#map1, #map0, #map0], iterator_types = ["parallel", "parallel"]}
142      ins(%3, %arg2 : tensor<?x?xf32>, tensor<?x?xf32>)
143      outs(%2 : tensor<?x?xf32>){
144    ^bb0(%arg5: f32, %arg6: f32, %arg7: f32):       // no predecessors
145      %5= mulf %arg5, %arg6 : f32
146      linalg.yield %5 : f32
147    } -> tensor<?x?xf32>
148  return %4 : tensor<?x?xf32>
149}
150
151// -----
152
// Test case: a 1-D addf producer (#map2) is consumed by a 2-D mulf generic
// that reads it through the broadcasting map #map1 ((d0, d1) -> (d0)). The
// directives expect one linalg.generic whose indexing map list begins with
// [broadcast, broadcast, identity, identity]; the end of the list is not
// pinned by the pattern.
153// CHECK-DAG: [[$MAP0:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0, d1)>
154// CHECK-DAG: [[$MAP1:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0)>
155#map0 = affine_map<(d0, d1) -> (d0, d1)>
156#map1 = affine_map<(d0, d1) -> (d0)>
157#map2 = affine_map<(d0) -> (d0)>
158
159// CHECK-LABEL: @add_broadcast_mul_fusion
160func @add_broadcast_mul_fusion(%arg0: tensor<?xf32>, %arg1 : tensor<?xf32>, %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
161{
162  %c0 = constant 0 : index
163  %c1 = constant 1 : index
164  %0 = tensor.dim %arg0, %c0 : tensor<?xf32>
165  %1 = linalg.init_tensor [%0] : tensor<?xf32>
  // Producer: 1-D elementwise add of %arg0 and %arg1.
166  %2 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]}
167      ins(%arg0, %arg1 : tensor<?xf32>, tensor<?xf32>)
168      outs(%1 : tensor<?xf32>) {
169    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):       // no predecessors
170      %3 = addf %arg3, %arg4 : f32
171      linalg.yield %3 : f32
172  } -> tensor<?xf32>
173  // CHECK: linalg.generic {
174  // CHECK-SAME: indexing_maps = {{\[}}[[$MAP1]], [[$MAP1]], [[$MAP0]], [[$MAP0]]
175  %3 = tensor.dim %arg2, %c1 : tensor<?x?xf32>
176  %4 = linalg.init_tensor [%0, %3] : tensor<?x?xf32>
  // Consumer: 2-D multiply; the 1-D producer result is read via #map1.
177  %5 = linalg.generic {indexing_maps = [#map1, #map0, #map0], iterator_types = ["parallel", "parallel"]}
178      ins(%2, %arg2 : tensor<?xf32>, tensor<?x?xf32>)
179      outs(%4 : tensor<?x?xf32>){
180    ^bb0(%arg5: f32, %arg6: f32, %arg7: f32):       // no predecessors
181      %6 = mulf %arg5, %arg6 : f32
182      linalg.yield %6 : f32
183    } -> tensor<?x?xf32>
184  return %5 : tensor<?x?xf32>
185}
186
187// -----
188
// Test case: addf -> mulf chain on rank-0 tensors (tensor<f32>) with the empty
// map and no iterators. The directives only require that a linalg.generic is
// followed by an addf and then a mulf in the output.
189// CHECK: #[[$MAP0:.*]] = affine_map<() -> ()>
190#map0 = affine_map<() -> ()>
191
192// CHECK-LABEL: @add_mul_scalar_fusion
193func @add_mul_scalar_fusion(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<f32>) -> tensor<f32>
194{
195  %0 = linalg.init_tensor [] : tensor<f32>
  // Producer: rank-0 add.
196  %1 = linalg.generic {indexing_maps = [#map0, #map0, #map0], iterator_types = []}
197      ins(%arg0, %arg1 : tensor<f32>, tensor<f32>)
198      outs(%0 : tensor<f32>) {
199    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):       // no predecessors
200      %2 = addf %arg3, %arg4 : f32
201      linalg.yield %2 : f32
202  } -> tensor<f32>
203  // CHECK: linalg.generic {
204  // CHECK: addf
205  // CHECK: mulf
  // Consumer: rank-0 multiply of the producer result with %arg2.
206  %2 = linalg.generic {indexing_maps = [#map0, #map0, #map0], iterator_types = []}
207      ins(%1, %arg2 : tensor<f32>, tensor<f32>)
208      outs(%0 : tensor<f32>) {
209    ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):       // no predecessors
210      %3 = mulf %arg3, %arg4 : f32
211      linalg.yield %3 : f32
212  } -> tensor<f32>
213
214  return %2 : tensor<f32>
215}
216
217// -----
218
// Test case: a generic whose first input is a splat constant tensor<5xf32>
// (dense<42.0>) read through #map0 ((d0, d1, d2) -> (d0)). The directives
// expect an f32 scalar constant, followed by a linalg.generic whose block has
// exactly two f32 arguments and multiplies that scalar constant with the
// first block argument.
219#map0 = affine_map<(d0, d1, d2) -> (d0)>
220#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
221func @generic_op_constant_fusion(%arg0 : tensor<5x?x?xf32>) -> tensor<5x?x?xf32>
222{
223  %c0 = constant 0 : index
224  %c1 = constant 1 : index
225  %c2 = constant 2 : index
226  %cst = constant dense<42.0> : tensor<5xf32>
227  %0 = tensor.dim %arg0, %c1 : tensor<5x?x?xf32>
228  %1 = tensor.dim %arg0, %c2 : tensor<5x?x?xf32>
229  %2 = linalg.init_tensor [5, %0, %1] : tensor<5x?x?xf32>
230  %3 = linalg.generic {
231    indexing_maps = [#map0, #map1, #map1],
232    iterator_types = ["parallel", "parallel", "parallel"]}
233    ins(%cst, %arg0 : tensor<5xf32>, tensor<5x?x?xf32>)
234    outs(%2 : tensor<5x?x?xf32>) {
235    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
236      %4 = mulf %arg1, %arg2 : f32
237      linalg.yield %4 : f32
238    } -> tensor<5x?x?xf32>
239  return %3 : tensor<5x?x?xf32>
240}
241//   CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
242// CHECK-LABEL: func @generic_op_constant_fusion
243//       CHECK:   %[[CST:.*]] = constant {{.*}} : f32
244//       CHECK:   linalg.generic
245//       CHECK:   ^{{.+}}(%[[ARG1:[a-zA-Z0-9_]+]]: f32, %{{.+}}: f32):
246//       CHECK:     mulf %[[CST]], %[[ARG1]]
247
248// -----
249
// Test case: like the splat-constant case with a 1-D constant, except the
// constant operand here is a rank-0 tensor<f32> (dense<42.0>) read through
// #map0 ((d0, d1, d2) -> ()). The directives expect an f32 scalar constant,
// then a linalg.generic whose block starts with two f32 arguments and
// multiplies that scalar constant with the first block argument.
250#map0 = affine_map<(d0, d1, d2) -> ()>
251#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
252func @generic_op_zero_dim_constant_fusion(%arg0 : tensor<5x?x?xf32>)
253  -> tensor<5x?x?xf32>
254{
255  %c0 = constant 0 : index
256  %c1 = constant 1 : index
257  %c2 = constant 2 : index
258  %cst = constant dense<42.0> : tensor<f32>
259  %0 = tensor.dim %arg0, %c1 : tensor<5x?x?xf32>
260  %1 = tensor.dim %arg0, %c2 : tensor<5x?x?xf32>
261  %2 = linalg.init_tensor [5, %0, %1] : tensor<5x?x?xf32>
262  %3 = linalg.generic {
263    indexing_maps = [#map0, #map1, #map1],
264    iterator_types = ["parallel", "parallel", "parallel"]}
265    ins(%cst, %arg0 : tensor<f32>, tensor<5x?x?xf32>)
266    outs(%2 : tensor<5x?x?xf32>) {
267    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
268      %4 = mulf %arg1, %arg2 : f32
269      linalg.yield %4 : f32
270    } -> tensor<5x?x?xf32>
271  return %3 : tensor<5x?x?xf32>
272}
273//   CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
274// CHECK-LABEL: func @generic_op_zero_dim_constant_fusion
275//       CHECK:   %[[CST:.*]] = constant {{.*}} : f32
276//       CHECK:   linalg.generic
277//       CHECK:   ^{{.*}}(%[[ARG1:[a-zA-Z0-9_]*]]: f32, %{{.*}}: f32)
278//       CHECK:     mulf %[[CST]], %[[ARG1]]
279
280// -----
281
// Test case: a plain addi producer feeds a consumer whose body uses
// linalg.index. The directives expect a single linalg.generic with three
// identity maps; its block first performs the producer's addi on the two
// inputs, then the consumer's linalg.index / index_cast / addi / subi chain
// applied to that sum, and no further linalg.generic appears afterwards.
282#map0 = affine_map<(d0, d1) -> (d0, d1)>
283func @producer_indexed_consumer_fusion(%arg0: tensor<?x?xi32>,
284                                       %arg1: tensor<?x?xi32>) -> tensor<?x?xi32> {
285  %c0 = constant 0 : index
286  %c1 = constant 1 : index
287  %0 = tensor.dim %arg0, %c0 : tensor<?x?xi32>
288  %1 = tensor.dim %arg0, %c1 : tensor<?x?xi32>
289  %2 = linalg.init_tensor [%0, %1] : tensor<?x?xi32>
  // Producer: elementwise integer add, no index use.
290  %3 = linalg.generic {
291    indexing_maps = [#map0, #map0, #map0],
292    iterator_types = ["parallel", "parallel"] }
293    ins(%arg0, %arg1  : tensor<?x?xi32>, tensor<?x?xi32>)
294    outs(%2 : tensor<?x?xi32>) {
295    ^bb0(%arg2: i32, %arg3: i32, %arg4: i32):       // no predecessors
296      %10 = addi %arg2, %arg3 : i32
297      linalg.yield %10 : i32
298    } -> tensor<?x?xi32>
  // Consumer: computes input + index(0) - index(1).
299  %4 = linalg.generic {
300    indexing_maps = [#map0, #map0],
301    iterator_types = ["parallel", "parallel"] }
302    ins(%3 : tensor<?x?xi32>)
303    outs(%2 : tensor<?x?xi32>) {
304    ^bb0(%arg2: i32, %arg3: i32):       // no predecessors
305      %idx0 = linalg.index 0 : index
306      %idx1 = linalg.index 1 : index
307      %5 = index_cast %idx0 : index to i32
308      %6 = index_cast %idx1 : index to i32
309      %7 = addi %arg2, %5 : i32
310      %8 = subi %7, %6 : i32
311      linalg.yield %8 : i32
312    } -> tensor<?x?xi32>
313  return %4 : tensor<?x?xi32>
314}
315//   CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)>
316// CHECK-LABEL: func @producer_indexed_consumer_fusion
317//      CHECK: linalg.generic
318// CHECK-SAME:    indexing_maps = [#[[$MAP0]], #[[$MAP0]], #[[$MAP0]]]
319//      CHECK: ^{{[a-zA-Z0-9_]*}}
320// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: i32
321// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: i32
322//      CHECK:   %[[VAL1:.+]] = addi %[[ARG0]], %[[ARG1]] : i32
323//      CHECK:   %[[IDX0:.+]] = linalg.index 0 : index
324//      CHECK:   %[[IDX1:.+]] = linalg.index 1 : index
325//      CHECK:   %[[ADD_OPERAND:.+]] = index_cast %[[IDX0]] : index to i32
326//      CHECK:   %[[SUB_OPERAND:.+]] = index_cast %[[IDX1]] : index to i32
327//      CHECK:   %[[VAL2:.+]] = addi %[[VAL1]], %[[ADD_OPERAND]] : i32
328//      CHECK:   %[[VAL3:.+]] = subi %[[VAL2]], %[[SUB_OPERAND]] : i32
329//      CHECK:   linalg.yield %[[VAL3]] : i32
330//  CHECK-NOT: linalg.generic
331
332// -----
333
// Test case: the producer's body uses linalg.index and the consumer is a plain
// addi that reads both the producer result and %arg0. The directives expect a
// single linalg.generic with two identity maps; its block performs the
// producer's index-based add/sub chain on the first block argument and then
// adds that same block argument to the result. No further linalg.generic may
// follow.
334#map0 = affine_map<(d0, d1) -> (d0, d1)>
335func @indexed_producer_consumer_fusion(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {
336  %c0 = constant 0 : index
337  %c1 = constant 1 : index
338  %0 = tensor.dim %arg0, %c0 : tensor<?x?xi32>
339  %1 = tensor.dim %arg0, %c1 : tensor<?x?xi32>
340  %2 = linalg.init_tensor [%0, %1] : tensor<?x?xi32>
  // Producer: computes input + index(0) - index(1).
341  %3 = linalg.generic {
342    indexing_maps = [#map0, #map0],
343    iterator_types = ["parallel", "parallel"] }
344    ins(%arg0 : tensor<?x?xi32>)
345    outs(%2 : tensor<?x?xi32>) {
346    ^bb0(%arg4: i32, %arg5: i32):       // no predecessors
347      %idx0 = linalg.index 0 : index
348      %idx1 = linalg.index 1 : index
349      %4 = index_cast %idx0 : index to i32
350      %5 = index_cast %idx1 : index to i32
351      %6 = addi %arg4, %4 : i32
352      %7 = subi %6, %5 : i32
353      linalg.yield %7 : i32
354    } -> tensor<?x?xi32>
  // Consumer: adds the producer result to %arg0 elementwise.
355  %4 = linalg.generic {
356    indexing_maps = [#map0, #map0, #map0],
357    iterator_types = ["parallel", "parallel"] }
358    ins(%3, %arg0 : tensor<?x?xi32>, tensor<?x?xi32>)
359    outs(%2 : tensor<?x?xi32>) {
360    ^bb0(%arg2: i32, %arg3: i32, %arg4: i32):       // no predecessors
361      %10 = addi %arg2, %arg3 : i32
362      linalg.yield %10 : i32
363    } -> tensor<?x?xi32>
364  return %4 : tensor<?x?xi32>
365}
366//   CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)>
367// CHECK-LABEL: func @indexed_producer_consumer_fusion
368//       CHECK: linalg.generic
369// CHECK-SAME:    indexing_maps = [#[[$MAP0]], #[[$MAP0]]]
370//      CHECK: ^{{[a-zA-Z0-9_]*}}
371// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: i32
372// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: i32
373//      CHECK:   %[[IDX0:.+]] = linalg.index 0 : index
374//      CHECK:   %[[IDX1:.+]] = linalg.index 1 : index
375//      CHECK:   %[[ADD_OPERAND:.+]] = index_cast %[[IDX0]] : index to i32
376//      CHECK:   %[[SUB_OPERAND:.+]] = index_cast %[[IDX1]] : index to i32
377//      CHECK:   %[[VAL1:.+]] = addi %[[ARG0]], %[[ADD_OPERAND]] : i32
378//      CHECK:   %[[VAL2:.+]] = subi %[[VAL1]], %[[SUB_OPERAND]] : i32
379//      CHECK:   %[[VAL3:.+]] = addi %[[VAL2]], %[[ARG0]] : i32
380//      CHECK:   linalg.yield %[[VAL3]] : i32
381//   CHECK-NOT: linalg.generic
382
383// -----
384
// Test case: both the producer and the consumer use linalg.index. The producer
// iterates with the transposing map #map0 ((d0, d1) -> (d1, d0)) while the
// consumer uses the identity map #map1, so the indices of the first generic op
// are swapped after fusion: the directives expect the producer's add operand
// to be cast from linalg.index 1 and its sub operand from linalg.index 0,
// followed by the consumer's index computation with the indices in their
// original roles. No further linalg.generic may follow.
386#map0 = affine_map<(d0, d1) -> (d1, d0)>
387#map1 = affine_map<(d0, d1) -> (d0, d1)>
388func @indexed_producer_indexed_consumer_fusion(%arg0: tensor<?x?xi32>)
389                                               -> tensor<?x?xi32> {
390  %c0 = constant 0 : index
391  %c1 = constant 1 : index
392  %0 = tensor.dim %arg0, %c0 : tensor<?x?xi32>
393  %1 = tensor.dim %arg0, %c1 : tensor<?x?xi32>
394  %2 = linalg.init_tensor [%0, %1] : tensor<?x?xi32>
  // Producer (transposed maps): computes index(1) - (input + index(0)).
395  %3 = linalg.generic {
396    indexing_maps = [#map0, #map0],
397    iterator_types = ["parallel", "parallel"] }
398    ins(%arg0 : tensor<?x?xi32>)
399    outs(%2 : tensor<?x?xi32>) {
400    ^bb0(%arg2: i32, %arg3: i32):       // no predecessors
401      %idx0 = linalg.index 0 : index
402      %idx1 = linalg.index 1 : index
403      %4 = index_cast %idx0 : index to i32
404      %5 = index_cast %idx1 : index to i32
405      %6 = addi %arg2, %4 : i32
406      %7 = subi %5, %6 : i32
407      linalg.yield %7 : i32
408    } -> tensor<?x?xi32>
  // Consumer (identity maps): computes input + index(0) - index(1).
409  %4= linalg.generic {
410    indexing_maps = [#map1, #map1],
411    iterator_types = ["parallel", "parallel"] }
412    ins(%3 : tensor<?x?xi32>)
413    outs(%2 : tensor<?x?xi32>) {
414    ^bb0(%arg2: i32, %arg3: i32):       // no predecessors
415      %idx0 = linalg.index 0 : index
416      %idx1 = linalg.index 1 : index
417      %5 = index_cast %idx0 : index to i32
418      %6 = index_cast %idx1 : index to i32
419      %7 = addi %arg2, %5 : i32
420      %8 = subi %7, %6 : i32
421      linalg.yield %8 : i32
422    } -> tensor<?x?xi32>
423  return %4 : tensor<?x?xi32>
424}
425//   CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)>
426// CHECK-LABEL: func @indexed_producer_indexed_consumer_fusion
427//       CHECK: linalg.generic
428// CHECK-SAME:    indexing_maps = [#[[$MAP0]], #[[$MAP0]]]
429//      CHECK: ^{{[a-zA-Z0-9_]*}}
430// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: i32
431//      CHECK:   %[[IDX0:.+]] = linalg.index 0 : index
432//      CHECK:   %[[IDX1:.+]] = linalg.index 1 : index
433//      CHECK:   %[[ADD_OPERAND1:.+]] = index_cast %[[IDX1]] : index to i32
434//      CHECK:   %[[SUB_OPERAND1:.+]] = index_cast %[[IDX0]] : index to i32
435//      CHECK:   %[[VAL1:.+]] = addi %[[ARG0]], %[[ADD_OPERAND1]] : i32
436//      CHECK:   %[[VAL2:.+]] = subi %[[SUB_OPERAND1]], %[[VAL1]] : i32
437//      CHECK:   %[[IDX2:.+]] = linalg.index 0 : index
438//      CHECK:   %[[IDX3:.+]] = linalg.index 1 : index
439//      CHECK:   %[[ADD_OPERAND2:.+]] = index_cast %[[IDX2]] : index to i32
440//      CHECK:   %[[SUB_OPERAND2:.+]] = index_cast %[[IDX3]] : index to i32
441//      CHECK:   %[[VAL3:.+]] = addi %[[VAL2]], %[[ADD_OPERAND2]] : i32
442//      CHECK:   %[[VAL4:.+]] = subi %[[VAL3]], %[[SUB_OPERAND2]] : i32
443//      CHECK:   linalg.yield %[[VAL4]] : i32
444//   CHECK-NOT: linalg.generic
445
446// -----
447
// Test case: a 1-D producer that uses linalg.index 0 is consumed by a 2-D
// generic through #map3 ((d0, d1) -> (d1)). The directives expect a single
// linalg.generic with maps [identity, (d1), identity] in which the producer's
// index has become linalg.index 1, its sum is formed with the second block
// argument, and the consumer's addi then combines the first block argument
// with that sum. No further linalg.generic may follow.
448#map1 = affine_map<(d0) -> (d0)>
449#map2 = affine_map<(d0, d1) -> (d0, d1)>
450#map3 = affine_map<(d0, d1) -> (d1)>
451func @one_dim_indexed_producer_consumer_fusion(%arg0 : tensor<?xi32>,
452                                               %arg1 : tensor<?x?xi32>) -> tensor<?x?xi32> {
453  %c0 = constant 0 : index
454  %c1 = constant 1 : index
455  %d0 = tensor.dim %arg0, %c0 : tensor<?xi32>
456  %0 = linalg.init_tensor [%d0] : tensor<?xi32>
  // Producer: 1-D, computes input + index(0).
457  %1 = linalg.generic
458      {indexing_maps = [#map1, #map1],
459       iterator_types = ["parallel"]}
460      ins(%arg0 : tensor<?xi32>) outs(%0 : tensor<?xi32>) {
461      ^bb0(%arg2 : i32, %arg3 : i32):
462        %2 = linalg.index 0 : index
463        %3 = index_cast %2 : index to i32
464        %4 = addi %arg2, %3 : i32
465        linalg.yield %4 : i32
466      } -> tensor<?xi32>
467  %2 = tensor.dim %arg1, %c0 : tensor<?x?xi32>
468  %3 = tensor.dim %arg1, %c1 : tensor<?x?xi32>
469  %4 = linalg.init_tensor [%2, %3] : tensor<?x?xi32>
  // Consumer: 2-D add; the producer result is read along d1 via #map3.
470  %5 = linalg.generic
471      {indexing_maps = [#map2, #map3, #map2],
472       iterator_types = ["parallel", "parallel"]}
473      ins(%arg1, %1 : tensor<?x?xi32>, tensor<?xi32>)
474      outs(%4 : tensor<?x?xi32>) {
475      ^bb0(%arg2 : i32, %arg3 : i32, %arg4: i32):
476        %6 = addi %arg2, %arg3 : i32
477        linalg.yield %6 : i32
478     } -> tensor<?x?xi32>
479  return %5 : tensor<?x?xi32>
480}
481//   CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)>
482//   CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d1)>
483// CHECK-LABEL: func @one_dim_indexed_producer_consumer_fusion
484//       CHECK: linalg.generic
485// CHECK-SAME:    indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP0]]]
486//      CHECK: ^{{[a-zA-Z0-9_]*}}
487// CHECK-SAME: (%[[ARG0:[a-zA-Z0-9_]*]]: i32, %[[ARG1:[a-zA-Z0-9_]*]]: i32
488//      CHECK:   %[[IDX1:.+]] = linalg.index 1 : index
489//      CHECK:   %[[VAL1:.+]] = index_cast %[[IDX1]] : index to i32
490//      CHECK:   %[[VAL2:.+]] = addi %[[ARG1]], %[[VAL1]] : i32
491//      CHECK:   %[[VAL3:.+]] = addi %[[ARG0]], %[[VAL2]] : i32
492//      CHECK:   linalg.yield %[[VAL3]] : i32
493//   CHECK-NOT: linalg.generic
494
495// -----
496
// Test case: a rank-0 producer (no iterators) whose body extracts an element
// of %arg0 at a position taken from the i32 input feeds a 1-D mulf consumer,
// whose other input is a splat constant. The directives expect one
// linalg.generic with maps [(d0) -> (), (d0) -> (d0)], a single "parallel"
// iterator and %arg1 as its only input, containing the tensor.extract, and
// expect the function to return that generic's result.
// Note: the final return directive previously lacked the colon after its
// prefix, so FileCheck silently skipped it; the colon is restored below.
497func @scalar_generic_fusion
498  (%arg0: tensor<5x1x1xf32>, %arg1 : tensor<i32>) -> tensor<10xf32>
499{
500  %c0 = constant 0 : index
501  %cst = constant dense<1.000000e+00> : tensor<10xf32>
502  %0 = linalg.init_tensor [] : tensor<f32>
  // Producer: rank-0 generic reading %arg0[%arg1, 0, 0].
503  %1 = linalg.generic
504    {indexing_maps = [affine_map<() -> ()>, affine_map<() -> ()>],
505     iterator_types = []}
506    ins(%arg1 : tensor<i32>) outs(%0 : tensor<f32>) {
507    ^bb0(%arg2: i32, %arg3: f32):  // no predecessors
508      %3 = index_cast %arg2 : i32 to index
509      %4 = tensor.extract %arg0[%3, %c0, %c0] : tensor<5x1x1xf32>
510      linalg.yield %4 : f32
511    } -> tensor<f32>
512  %2 = linalg.init_tensor [10] : tensor<10xf32>
  // Consumer: multiplies the scalar producer result with the splat constant.
513  %3 = linalg.generic
514   {indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>,
515                     affine_map<(d0) -> (d0)>],
516    iterator_types = ["parallel"]}
517    ins(%1, %cst : tensor<f32>, tensor<10xf32>) outs(%2 : tensor<10xf32>) {
518    ^bb0(%arg2: f32, %arg3: f32, %arg4: f32):  // no predecessors
519      %4 = mulf %arg2, %arg3 : f32
520      linalg.yield %4 : f32
521    } -> tensor<10xf32>
522  return %3 : tensor<10xf32>
523}
524//   CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> ()>
525//   CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0)>
526//       CHECK: func @scalar_generic_fusion
527//  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: tensor<5x1x1xf32>
528//  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9]+]]: tensor<i32>
529//       CHECK:   %[[T0:.+]] = linalg.generic
530//  CHECK-SAME:     indexing_maps = [#[[MAP0]], #[[MAP1]]]
531//  CHECK-SAME:     iterator_types = ["parallel"]
532//  CHECK-SAME:     ins(%[[ARG1]] : tensor<i32>)
533//       CHECK:     tensor.extract %[[ARG0]]
534//       CHECK:     linalg.yield
535//       CHECK:   return %[[T0]]
536
537// -----
538
// Test case: an elementwise addf whose second input is a splat constant
// tensor<4xf32> (dense<1.0>). The directives expect a scalar f32 constant
// 1.0, a linalg.generic with two indexing maps that takes only %arg0 as
// input, and a block with two f32 arguments that adds the first block argument
// to the scalar constant; the generic's result is returned.
539func @constant_fusion(%arg0 : tensor<4xf32>) -> (tensor<4xf32>) {
540  %cst = constant dense<1.0> : tensor<4xf32>
541  %1 = linalg.init_tensor [4] : tensor<4xf32>
542  %2 = linalg.generic
543    {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>,
544                      affine_map<(d0) -> (d0)>],
545     iterator_types = ["parallel"]}
546    ins (%arg0, %cst : tensor<4xf32>, tensor<4xf32>)
547    outs (%1 : tensor<4xf32>) {
548    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
549      %3 = addf %arg1, %arg2 : f32
550      linalg.yield %3 : f32
551    } -> tensor<4xf32>
552  return %2 : tensor<4xf32>
553}
554
555//  CHECK-DAG: #[[MAP:.+]] = affine_map<(d0) -> (d0)>
556//      CHECK: func @constant_fusion(%[[ARG0:.+]]: tensor<4xf32>)
557//  CHECK-DAG:   %[[CST:.+]] = constant 1.000000e+00 : f32
558//  CHECK-DAG:   %[[T0:.+]] = linalg.init_tensor [4] : tensor<4xf32>
559//      CHECK:   %[[T1:.+]] = linalg.generic
560// CHECK-SAME:     indexing_maps = [#[[MAP]], #[[MAP]]]
561// CHECK-SAME:     ins(%[[ARG0]] : tensor<4xf32>)
562// CHECK-SAME:     outs(%[[T0]] : tensor<4xf32>)
563//      CHECK:   ^{{.+}}(
564// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9_]+]]: f32, %[[ARG2:[a-zA-Z0-9_]+]]: f32)
565//      CHECK:     %[[T2:.+]] = addf %[[ARG1]], %[[CST]]
566//      CHECK:     linalg.yield %[[T2]]
567//      CHECK:   return %[[T1]]
568
569// -----
570
// Test case: a 2-D all-parallel addf producer feeds a 1-D consumer with a
// single "reduction" iterator that reads the producer result through
// (d0) -> (0, d0) and accumulates into %arg2. The directives expect one
// linalg.generic with maps [(0, d0), (0, d0), (0)], a "reduction" iterator and
// %arg0/%arg1 as inputs, whose block adds the two inputs and then adds the
// accumulator argument; its result is returned.
571#map0 = affine_map<(d0, d1) -> (d0, d1)>
572#map1 = affine_map<(d0) -> (0, d0)>
573#map2 = affine_map<(d0) -> (0)>
574func @consumer_with_reduction(%arg0: tensor<1x10xf32>,
575                              %arg1: tensor<1x10xf32>,
576                              %arg2: tensor<1xf32>) -> tensor<1xf32> {
577  %init = linalg.init_tensor [1, 10] : tensor<1x10xf32>
  // Producer: elementwise add over the 1x10 tensors.
578  %0 = linalg.generic
579    {indexing_maps = [#map0, #map0, #map0],
580     iterator_types = ["parallel", "parallel"]}
581    ins(%arg0, %arg1 : tensor<1x10xf32>, tensor<1x10xf32>)
582    outs(%init : tensor<1x10xf32>) {
583  ^bb0(%arg3: f32, %arg4: f32, %arg5: f32):  // no predecessors
584    %2 = addf %arg3, %arg4 : f32
585    linalg.yield %2 : f32
586  } -> tensor<1x10xf32>
  // Consumer: sums the producer result into %arg2 along a reduction iterator.
587  %1 = linalg.generic
588    {indexing_maps = [#map1, #map2],
589     iterator_types = ["reduction"]}
590    ins(%0 : tensor<1x10xf32>)
591    outs(%arg2 : tensor<1xf32>)  {
592  ^bb0(%arg3: f32, %arg4: f32):  // no predecessors
593    %2 = addf %arg3, %arg4 : f32
594    linalg.yield %2 : f32
595  } -> tensor<1xf32>
596  return %1 : tensor<1xf32>
597}
598//  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (0, d0)>
599//  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (0)>
600//      CHECK: func @consumer_with_reduction(%[[ARG0:.+]]: tensor<1x10xf32>, %[[ARG1:.+]]: tensor<1x10xf32>, %[[ARG2:.+]]: tensor<1xf32>)
601//      CHECK:   %[[RES:.+]] = linalg.generic
602// CHECK-SAME:     indexing_maps = [#[[MAP0]], #[[MAP0]], #[[MAP1]]]
603// CHECK-SAME:     iterator_types = ["reduction"]
604// CHECK-SAME:     ins(%[[ARG0]], %[[ARG1]] : tensor<1x10xf32>, tensor<1x10xf32>)
605//      CHECK:   ^{{.+}}(%[[T0:.+]]: f32, %[[T1:.+]]: f32, %[[T2:.+]]: f32)
606//      CHECK:     %[[T3:.+]] = addf %[[T0]], %[[T1]] : f32
607//      CHECK:     %[[T4:.+]] = addf %[[T3]], %[[T2]] : f32
608//      CHECK:     linalg.yield %[[T4]]
609//      CHECK:   return %[[RES]]
610
611// -----
612
// Test case: the producer is an input-less generic that fills a dynamically
// sized tensor<?x1xf32> with the constant 0.5; its init tensor is sized from
// shape.shape_of, whereas the consumer's init tensor is sized from tensor.dim.
// The directives expect exactly one linalg.generic between the function label
// and the return, and that its result is what is returned.
613// CHECK-LABEL: func @sigmoid_dynamic_dim(
614//       CHECK:   %[[RES:.*]] = linalg.generic
615//   CHECK-NOT:   linalg.generic
616//       CHECK:   return %[[RES]]
617func @sigmoid_dynamic_dim(%0: tensor<?x1xf32>) -> tensor<?x1xf32> {
618  %cp5 = constant 5.000000e-01 : f32
619  %c0 = constant 0 : index
620  %shape = shape.shape_of %0 : tensor<?x1xf32> -> tensor<?xindex>
621  %extend = shape.to_extent_tensor %shape : tensor<?xindex> -> tensor<2xindex>
622  %extracted = tensor.extract %extend[%c0] : tensor<2xindex>
623  %init0 = linalg.init_tensor [%extracted, 1] : tensor<?x1xf32>
  // Producer: no inputs; yields the captured constant %cp5 for every element.
624  %1 = linalg.generic {indexing_maps = [
625    affine_map<(d0, d1) -> (d0, d1)>],
626    iterator_types = ["parallel", "parallel"]
627  }
628     outs(%init0 : tensor<?x1xf32>) {
629    ^bb0(%a: f32):  // no predecessors
630      linalg.yield %cp5 : f32
631  } -> tensor<?x1xf32>
632  %d0 = tensor.dim %0, %c0 : tensor<?x1xf32>
633  %init1 = linalg.init_tensor [%d0, 1] : tensor<?x1xf32>
  // Consumer: multiplies the function argument by the producer result.
634  %2 = linalg.generic {indexing_maps = [
635    affine_map<(d0, d1) -> (d0, d1)>,
636    affine_map<(d0, d1) -> (d0, d1)>,
637    affine_map<(d0, d1) -> (d0, d1)>],
638    iterator_types = ["parallel", "parallel"]
639  }
640      ins(%0, %1 : tensor<?x1xf32>, tensor<?x1xf32>)
641     outs(%init1 : tensor<?x1xf32>) {
642  ^bb0(%a: f32, %b: f32, %c: f32):  // no predecessors
643      %m = mulf %a, %b : f32
644      linalg.yield %m : f32
645  } -> tensor<?x1xf32>
646  return %2 : tensor<?x1xf32>
647}
648
649// -----
650
// Test case: the producer has only an outs operand (%arg0) and calls @compute1
// on its block argument; the consumer calls @compute2 on the producer result
// and its own output argument. The directives expect, immediately after the
// function label, a single linalg.generic whose block takes an f64 and an i32
// argument, calls @compute1 then @compute2 on consecutive lines, yields the
// i32 result, and is directly followed by the return of that generic's result.
// @compute1 and @compute2 are external declarations with no bodies in this
// file.
651func private @compute1(%a: f64) -> f64
652func private @compute2(%a: f64, %b: i32) -> i32
653
654// CHECK-LABEL: func @generic_index_op2(
655func @generic_index_op2(%arg0: tensor<1x8xf64>, %arg1: tensor<1x8xi32>) -> tensor<1x8xi32> {
656  %0 = linalg.generic {
657    indexing_maps = [affine_map<(i, j) -> (i, j)>],
658    iterator_types = ["parallel", "parallel"]}
659  outs(%arg0 : tensor<1x8xf64>) {
660  ^bb0(%a: f64):
661    %r = call @compute1(%a) : (f64) -> f64
662    linalg.yield %r : f64
663  } -> tensor<1x8xf64>
664
665  // CHECK-NEXT:   %[[R:.*]] = linalg.generic
666  //      CHECK:     bb0(%[[BBA:[0-9a-z]*]]: f64, %[[BBB:[0-9a-z]*]]: i32):
667  // CHECK-NEXT:       %[[A:.*]] = call @compute1(%[[BBA]]) : (f64) -> f64
668  // CHECK-NEXT:       %[[B:.*]] = call @compute2(%[[A]], %[[BBB]]) : (f64, i32) -> i32
669  // CHECK-NEXT:       linalg.yield %[[B]] : i32
670  // CHECK-NEXT:   } -> tensor<1x8xi32>
671  %1 = linalg.generic {
672    indexing_maps = [affine_map<(i, j) -> (i, j)>, affine_map<(i, j) -> (i, j)>],
673    iterator_types = ["parallel", "parallel"]}
674  ins(%0 : tensor<1x8xf64>)
675  outs(%arg1 : tensor<1x8xi32>) {
676  ^bb0(%a: f64, %b: i32):
677    %r = call @compute2(%a, %b) : (f64, i32) -> i32
678    linalg.yield %r : i32
679  } -> tensor<1x8xi32>
680
681  // CHECK-NEXT:   return %[[R]] : tensor<1x8xi32>
682  return %1 : tensor<1x8xi32>
683}
684
685// -----
686
// Negative test case: a splat constant tensor<3x2xf32> is the input of a
// generic with a "reduction" iterator. The directives expect the constant to
// remain a tensor<3x2xf32> value that is still passed through ins(...) of the
// linalg.generic, i.e. it is not folded into the op as a scalar.
687// CHECK-LABEL: func @no_fuse_constant_with_reduction
688func @no_fuse_constant_with_reduction() -> tensor<3xf32>
689{
690  //      CHECK: %[[CONST:.+]] = constant {{.+}} : tensor<3x2xf32>
691  //      CHECK: %[[RESULT:.+]] = linalg.generic
692  // CHECK-SAME:   ins(%[[CONST]] : tensor<3x2xf32>)
693  //      CHECK: return %[[RESULT]]
694  %three = constant dense<3.0> : tensor<3x2xf32>
695  %init = linalg.init_tensor [3] : tensor<3xf32>
696  %result = linalg.generic {
697      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
698                       affine_map<(d0, d1) -> (d0)>],
699      iterator_types = ["parallel", "reduction"]}
700     ins(%three : tensor<3x2xf32>) outs(%init : tensor<3xf32>) {
701     ^bb0(%arg0 : f32, %arg1 : f32):
702        %0 = addf %arg0, %arg1 : f32
703        linalg.yield %0 : f32
704  } -> tensor<3xf32>
705  return %result : tensor<3xf32>
706}
707
708// -----
709
// Test case: two all-parallel generics that each pass their own input tensor
// as the outs operand as well, while their bodies never read the outs block
// argument (%arg2). The directives expect each linalg.generic in the output to
// take a fresh linalg.init_tensor as outs instead: the first sized by
// tensor.dim of the function argument, the second sized by tensor.dim of the
// first generic's result.
710#map = affine_map<(d0, d1) -> (d0, d1)>
711#trait = {
712  indexing_maps = [#map, #map],
713  iterator_types = ["parallel", "parallel"]
714}
715func @break_outs_dependency(%arg0 : tensor<?x?xf32>) -> tensor<?x?xf32>
716{
717  %0 = linalg.generic #trait ins(%arg0 : tensor<?x?xf32>) outs(%arg0 : tensor<?x?xf32>) {
718       ^bb0(%arg1 : f32, %arg2 : f32) :
719         %1 = addf %arg1, %arg1 : f32
720         linalg.yield %1 : f32
721       } -> tensor<?x?xf32>
722  %2 = linalg.generic #trait ins(%0 : tensor<?x?xf32>) outs(%0 : tensor<?x?xf32>) {
723       ^bb0(%arg1 : f32, %arg2 : f32) :
724         %3 = mulf %arg1, %arg1 : f32
725         linalg.yield %3 : f32
726       } -> tensor<?x?xf32>
727  return %2 : tensor<?x?xf32>
728}
729//      CHECK: func @break_outs_dependency(
730// CHECK-SAME:   %[[ARG0:.+]]: tensor<?x?xf32>)
731//  CHECK-DAG:   %[[C0:.+]] = constant 0 : index
732//  CHECK-DAG:   %[[C1:.+]] = constant 1 : index
733//  CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
734//  CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
735//  CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor [%[[D0]], %[[D1]]]
736//      CHECK:   %[[GENERIC1:.+]] = linalg.generic
737// CHECK-SAME:     outs(%[[INIT]] : tensor<?x?xf32>)
738//  CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[GENERIC1]], %[[C0]]
739//  CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[GENERIC1]], %[[C1]]
740//  CHECK-DAG:   %[[INIT:.+]] = linalg.init_tensor [%[[D0]], %[[D1]]]
741//      CHECK:   %[[RESULT:.+]] = linalg.generic
742// CHECK-SAME:     outs(%[[INIT]] : tensor<?x?xf32>)
743
744// -----
745
// Scalar constants handed to a generic through `ins` are expected to be
// absorbed into the payload. Per the directives below, only the tensor operand
// is left in `ins`, the addf reads the f32 constant directly, and the i32
// constant is yielded directly.
#fsc_identity = affine_map<(d0, d1) -> (d0, d1)>
#fsc_scalar = affine_map<(d0, d1) -> ()>
func @fuse_scalar_constant(%arg0 : tensor<?x?xf32>) -> (tensor<?x?xf32>, tensor<?x?xi32>) {
  %f32_cst = constant 4.0 : f32
  %i32_cst = constant 42 : i32
  %idx0 = constant 0 : index
  %idx1 = constant 1 : index
  %dim0 = tensor.dim %arg0, %idx0 : tensor<?x?xf32>
  %dim1 = tensor.dim %arg0, %idx1 : tensor<?x?xf32>
  // One init tensor per result; both take their sizes from %arg0.
  %init_f32 = linalg.init_tensor [%dim0, %dim1] : tensor<?x?xf32>
  %init_i32 = linalg.init_tensor [%dim0, %dim1] : tensor<?x?xi32>
  %res:2 = linalg.generic {
      indexing_maps = [#fsc_identity, #fsc_scalar, #fsc_scalar,
                       #fsc_identity, #fsc_identity],
      iterator_types = ["parallel", "parallel"]}
      ins(%arg0, %f32_cst, %i32_cst : tensor<?x?xf32>, f32, i32)
      outs(%init_f32, %init_i32 : tensor<?x?xf32>, tensor<?x?xi32>) {
    ^bb0(%in: f32, %fscalar: f32, %iscalar: i32, %out_f: f32, %out_i: i32):
      %sum = addf %in, %fscalar : f32
      linalg.yield %sum, %iscalar : f32, i32
  } -> (tensor<?x?xf32>, tensor<?x?xi32>)
  return %res#0, %res#1 : tensor<?x?xf32>, tensor<?x?xi32>
}
// CHECK-LABEL: func @fuse_scalar_constant
//   CHECK-DAG:   %[[CST:.+]] = constant 4.000000e+00 : f32
//   CHECK-DAG:   %[[C42:.+]] = constant 42 : i32
//       CHECK:   linalg.generic
//  CHECK-SAME:       ins(%{{.+}} : tensor<?x?xf32>)
//       CHECK:     %[[YIELD:.+]] = addf %{{.+}}, %[[CST]] : f32
//       CHECK:     linalg.yield %[[YIELD]], %[[C42]] : f32, i32
777
778// -----
779
// A generic that only yields its constant input through a transposing
// indexing map is expected to fold away: the directives require a transposed
// 3x2 f32 constant that the function returns directly.
// CHECK-LABEL: @transpose_fold_2d_fp32
func @transpose_fold_2d_fp32(%init: tensor<3x2xf32>) -> tensor<3x2xf32> {
  %cst_in = constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32>
  //               CHECK: %[[CST:.+]] = constant
  // CHECK-SAME{LITERAL}:   dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf32>
  %transposed = linalg.generic {
    indexing_maps = [
      affine_map<(d0, d1) -> (d1, d0)>,
      affine_map<(d0, d1) -> (d0, d1)>
    ],
    iterator_types = ["parallel", "parallel"]
  } ins(%cst_in : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<3x2xf32>
  // CHECK: return %[[CST]]
  return %transposed : tensor<3x2xf32>
}
795
796// -----
797
// f64 variant of the 2-D transpose fold: the pass-through generic over a
// constant 2x3 input is expected to become a transposed 3x2 f64 constant that
// is returned directly.
// CHECK-LABEL: @transpose_fold_2d_fp64
func @transpose_fold_2d_fp64(%init: tensor<3x2xf64>) -> tensor<3x2xf64> {
  %cst_in = constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf64>
  //               CHECK: %[[CST:.+]] = constant
  // CHECK-SAME{LITERAL}:   dense<[[0.000000e+00, 3.000000e+00], [1.000000e+00, 4.000000e+00], [2.000000e+00, 5.000000e+00]]> : tensor<3x2xf64>
  %transposed = linalg.generic {
    indexing_maps = [
      affine_map<(d0, d1) -> (d1, d0)>,
      affine_map<(d0, d1) -> (d0, d1)>
    ],
    iterator_types = ["parallel", "parallel"]
  } ins(%cst_in : tensor<2x3xf64>) outs(%init : tensor<3x2xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  } -> tensor<3x2xf64>
  // CHECK: return %[[CST]]
  return %transposed : tensor<3x2xf64>
}
813
814// -----
815
// 4-D integer transpose fold. The input is read with the identity map and the
// output is written at (d2, d0, d3, d1), so the constant 1x2x3x4 input is
// permuted into a 3x1x4x2 result. The directives expect the generic to fold
// into that permuted constant, which is returned directly.
// CHECK-LABEL: @transpose_fold_4d_i32
func @transpose_fold_4d_i32(%init: tensor<3x1x4x2xi32>) -> tensor<3x1x4x2xi32> {
  %input = constant dense<[[
    [[ 0,  1,  2,  3], [ 4,  5,  6,  7], [ 8,  9, 10, 11]],
    [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]
  ]]> : tensor<1x2x3x4xi32>
  // The closing directive also pins the tensor type of the folded constant,
  // matching the 2-D cases, so the i32 and i16 cases no longer share the same
  // expectations.
  //               CHECK: %[[CST:.+]] = constant dense<[
  // CHECK-SAME{LITERAL}:   [[[0, 12], [1, 13], [2, 14], [3, 15]]],
  // CHECK-SAME{LITERAL}:   [[[4, 16], [5, 17], [6, 18], [7, 19]]],
  // CHECK-SAME{LITERAL}:   [[[8, 20], [9, 21], [10, 22], [11, 23]]]
  // CHECK-SAME{LITERAL}: ]> : tensor<3x1x4x2xi32>
  %1 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>],
    iterator_types = ["parallel", "parallel", "parallel", "parallel"]
  } ins(%input : tensor<1x2x3x4xi32>) outs(%init : tensor<3x1x4x2xi32>) {
  ^bb0(%arg1: i32, %arg2: i32):
    linalg.yield %arg1 : i32
  } -> tensor<3x1x4x2xi32>
  // CHECK: return %[[CST]]
  return %1 : tensor<3x1x4x2xi32>
}
837
838// -----
839
// i16 variant of the 4-D transpose fold. The input is read with the identity
// map and the output is written at (d2, d0, d3, d1), so the constant 1x2x3x4
// input is permuted into a 3x1x4x2 result. The directives expect the generic
// to fold into that permuted constant, which is returned directly.
// CHECK-LABEL: @transpose_fold_4d_i16
func @transpose_fold_4d_i16(%init: tensor<3x1x4x2xi16>) -> tensor<3x1x4x2xi16> {
  %input = constant dense<[[
    [[ 0,  1,  2,  3], [ 4,  5,  6,  7], [ 8,  9, 10, 11]],
    [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]
  ]]> : tensor<1x2x3x4xi16>
  // The closing directive also pins the tensor type of the folded constant, so
  // the narrower element type is actually part of what this case verifies.
  //               CHECK: %[[CST:.+]] = constant dense<[
  // CHECK-SAME{LITERAL}:   [[[0, 12], [1, 13], [2, 14], [3, 15]]],
  // CHECK-SAME{LITERAL}:   [[[4, 16], [5, 17], [6, 18], [7, 19]]],
  // CHECK-SAME{LITERAL}:   [[[8, 20], [9, 21], [10, 22], [11, 23]]]
  // CHECK-SAME{LITERAL}: ]> : tensor<3x1x4x2xi16>
  %1 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d2, d0, d3, d1)>],
    iterator_types = ["parallel", "parallel", "parallel", "parallel"]
  } ins(%input : tensor<1x2x3x4xi16>) outs(%init : tensor<3x1x4x2xi16>) {
  ^bb0(%arg1: i16, %arg2: i16):
    linalg.yield %arg1 : i16
  } -> tensor<3x1x4x2xi16>
  // CHECK: return %[[CST]]
  return %1 : tensor<3x1x4x2xi16>
}
861
862// -----
863
// Negative case: the transposed operand is a function argument rather than a
// constant, and the directive expects the linalg.generic to still be present
// in the output.
#nci_transposed = affine_map<(d0, d1) -> (d1, d0)>
#nci_identity = affine_map<(d0, d1) -> (d0, d1)>
// CHECK-LABEL: @transpose_nofold_non_cst_input
func @transpose_nofold_non_cst_input(%input: tensor<2x3xf32>, %init: tensor<3x2xf32>) -> tensor<3x2xf32> {
  // CHECK: linalg.generic
  %transposed = linalg.generic {
    indexing_maps = [#nci_transposed, #nci_identity],
    iterator_types = ["parallel", "parallel"]
  } ins(%input : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<3x2xf32>
  return %transposed : tensor<3x2xf32>
}
876
877// -----
878
// Negative case: the body yields a scalar constant defined outside the region
// instead of the input block argument, and the directive expects the
// linalg.generic to still be present in the output.
#nyc_transposed = affine_map<(d0, d1) -> (d1, d0)>
#nyc_identity = affine_map<(d0, d1) -> (d0, d1)>
// CHECK-LABEL: @transpose_nofold_yield_const
func @transpose_nofold_yield_const(%init: tensor<3x2xf32>) -> tensor<3x2xf32> {
  %cst_in = constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32>
  %eight = constant 8.0 : f32
  // CHECK: linalg.generic
  %filled = linalg.generic {
    indexing_maps = [#nyc_transposed, #nyc_identity],
    iterator_types = ["parallel", "parallel"]
  } ins(%cst_in : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %eight : f32
  } -> tensor<3x2xf32>
  return %filled : tensor<3x2xf32>
}
893
894// -----
895
// Negative case: the region computes an addf before yielding, so it is not a
// pure pass-through of the input, and the directive expects the
// linalg.generic to still be present in the output.
#nmo_transposed = affine_map<(d0, d1) -> (d1, d0)>
#nmo_identity = affine_map<(d0, d1) -> (d0, d1)>
// CHECK-LABEL: @transpose_nofold_multi_ops_in_region
func @transpose_nofold_multi_ops_in_region(%init: tensor<3x2xf32>) -> tensor<3x2xf32> {
  %cst_in = constant dense<[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]> : tensor<2x3xf32>
  // CHECK: linalg.generic
  %doubled = linalg.generic {
    indexing_maps = [#nmo_transposed, #nmo_identity],
    iterator_types = ["parallel", "parallel"]
  } ins(%cst_in : tensor<2x3xf32>) outs(%init : tensor<3x2xf32>) {
  ^bb0(%in: f32, %out: f32):
    %twice = addf %in, %in : f32
    linalg.yield %twice : f32
  } -> tensor<3x2xf32>
  return %doubled : tensor<3x2xf32>
}
910