// RUN: mlir-opt %s -split-input-file -affine-data-copy-generate="generate-dma=false fast-mem-space=0 skip-non-unit-stride-loops" | FileCheck %s
// Small buffer size to trigger fine copies.
// RUN: mlir-opt %s -split-input-file -affine-data-copy-generate="generate-dma=false fast-mem-space=0 fast-mem-capacity=1" | FileCheck --check-prefix=CHECK-SMALL %s

// Test affine data copy with a memref filter. We use a test pass that invokes
// the affine data copy utility on the input loop nest.
// '-test-affine-data-copy=memref-filter' passes the first memref found in an
// affine.load op in the innermost loop as a filter.
// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter' | FileCheck %s --check-prefix=FILTER
// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='for-memref-region' | FileCheck %s --check-prefix=MEMREF_REGION

// 'skip-non-unit-stride-loops' forces the copies to be placed right inside the
// tile space loops, avoiding the sensitivity of copy placement depth to memory
// footprint -- so that one could write a definite test case and not have to
// update it each time something related to the cost functions changes.

#id = affine_map<(d0) -> (d0)>
#ub = affine_map<(d0) -> (d0 + 128)>

// Maps used to index the buffer while computing.
// CHECK-DAG: [[$MAP_IDENTITY:map[0-9]+]] = affine_map<(d0) -> (d0)>
// CHECK-DAG: [[$MAP_PLUS_128:map[0-9]+]] = affine_map<(d0) -> (d0 + 128)>

// CHECK-LABEL: func @matmul
// FILTER-LABEL: func @matmul
func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> {
  affine.for %i = 0 to 4096 step 128 {
    affine.for %j = 0 to 4096 step 128 {
      affine.for %k = 0 to 4096 step 128 {
        affine.for %ii = #id(%i) to #ub(%i) {
          affine.for %jj = #id(%j) to #ub(%j) {
            affine.for %kk = #id(%k) to #ub(%k) {
              %5 = affine.load %A[%ii, %kk] : memref<4096x4096xf32>
              %6 = affine.load %B[%kk, %jj] : memref<4096x4096xf32>
              %7 = affine.load %C[%ii, %jj] : memref<4096x4096xf32>
              %8 = mulf %5, %6 : f32
              %9 = addf %7, %8 : f32
              affine.store %9, %C[%ii, %jj] : memref<4096x4096xf32>
            }
          }
        }
      }
    }
  }
  return %C : memref<4096x4096xf32>
}

// Buffers of size 128x128 get created here for all three matrices.

// CHECK: affine.for %[[I:.*]] = 0 to 4096 step 128 {
// CHECK: affine.for %[[J:.*]] = 0 to 4096 step 128 {
// CHECK: [[BUFC:%[0-9]+]] = memref.alloc() : memref<128x128xf32>
// The copy for the result matrix gets hoisted out of the %k loop.
// Result matrix copy-in.
// CHECK: affine.for %[[II:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.for %[[JJ:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
// CHECK: affine.store %{{.*}}, [[BUFC]][%[[II]] - %[[I]], %[[JJ]] - %[[J]]] : memref<128x128xf32>
// CHECK: }
// CHECK: }

// LHS matrix copy-in.
// CHECK: affine.for %[[K:.*]] = 0 to 4096 step 128 {
// CHECK: [[BUFA:%[0-9]+]] = memref.alloc() : memref<128x128xf32>
// CHECK: affine.for %[[II:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.for %[[KK:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
// CHECK: affine.store %{{.*}}, [[BUFA]][%[[II]] - %[[I]], %[[KK]] - %[[K]]] : memref<128x128xf32>
// CHECK: }
// CHECK: }

// RHS matrix copy-in.
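// Like the LHS buffer, the RHS buffer is allocated inside the %k loop since the
// tile of B being accessed varies with %k; only the result matrix's buffer is
// hoisted above it.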
// CHECK: [[BUFB:%[0-9]+]] = memref.alloc() : memref<128x128xf32>
// CHECK: affine.for %[[KK:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.for %[[JJ:.*]] = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
// CHECK: affine.store %{{.*}}, [[BUFB]][%[[KK]] - %[[K]], %[[JJ]] - %[[J]]] : memref<128x128xf32>
// CHECK: }
// CHECK: }

// Computation on the fast buffers.
// CHECK: affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.load [[BUFA]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
// CHECK: affine.load [[BUFB]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
// CHECK: affine.load [[BUFC]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
// CHECK: mulf %{{.*}}, %{{.*}} : f32
// CHECK: addf %{{.*}}, %{{.*}} : f32
// CHECK: affine.store %{{.*}}, [[BUFC]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
// CHECK: }
// CHECK: }
// CHECK: }
// CHECK: memref.dealloc [[BUFB]] : memref<128x128xf32>
// CHECK: memref.dealloc [[BUFA]] : memref<128x128xf32>
// CHECK: }

// Result matrix copy-out.
// CHECK: affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.for %{{.*}} = #[[$MAP_IDENTITY]](%{{.*}}) to #[[$MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.load [[BUFC]][%{{.*}} - %{{.*}}, %{{.*}} - %{{.*}}] : memref<128x128xf32>
// CHECK: store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
// CHECK: }
// CHECK: }
// CHECK: memref.dealloc [[BUFC]] : memref<128x128xf32>
// CHECK: }
// CHECK: }

// Check that only one memref is copied when the memref filter is used.

// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
// FILTER: memref.alloc() : memref<128x4096xf32>
// FILTER-NOT: memref.alloc()
// FILTER: affine.for
// FILTER: affine.for %{{.*}} = 0 to 4096 {
// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
// FILTER-NEXT: affine.for %{{.*}} = 0 to 4096 step 128 {
// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
// FILTER: memref.dealloc %{{.*}} : memref<128x4096xf32>
// FILTER-NOT: memref.dealloc %{{.*}} : memref<128x4096xf32>

// -----

//
// This test case will lead to single element buffers. These are eventually
// expected to be turned into registers via alloca and mem2reg.
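// With fast-mem-capacity=1 (the second RUN line above), each copied region
// degenerates to a single element, hence the 1x1 buffers in the checks below.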
//
// CHECK-SMALL-LABEL: func @single_elt_buffers
// FILTER-LABEL: func @single_elt_buffers
// MEMREF_REGION-LABEL: func @single_elt_buffers
func @single_elt_buffers(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
  affine.for %i = 0 to 1024 {
    affine.for %j = 0 to 1024 {
      affine.for %k = 0 to 1024 {
        %6 = affine.load %arg1[%k, %j] : memref<1024x1024xf32>
        %7 = affine.load %arg2[%i, %j] : memref<1024x1024xf32>
        %9 = addf %6, %7 : f32
        affine.store %9, %arg2[%i, %j] : memref<1024x1024xf32>
      }
    }
  }
  return %arg2 : memref<1024x1024xf32>
}
// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {
// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {
// CHECK-SMALL: memref.alloc() : memref<1x1xf32>
// CHECK-SMALL: affine.load %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {
// CHECK-SMALL: memref.alloc() : memref<1x1xf32>
// CHECK-SMALL: affine.load %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: addf %{{.*}}, %{{.*}} : f32
// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: memref.dealloc %{{.*}} : memref<1x1xf32>
// CHECK-SMALL: }
// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: affine.store %{{.*}}, %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
// CHECK-SMALL: memref.dealloc %{{.*}} : memref<1x1xf32>
// CHECK-SMALL: }
// CHECK-SMALL: }
// CHECK-SMALL: return

// Check that only one memref is copied when the memref filter is used.

// FILTER: memref.alloc() : memref<1024x1024xf32>
// FILTER-NOT: memref.alloc()
// FILTER: affine.for %{{.*}} = 0 to 1024 {
// FILTER: affine.for %{{.*}} = 0 to 1024 {
// FILTER: affine.for %{{.*}} = 0 to 1024 {
// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 {
// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 {
// FILTER: memref.dealloc %{{.*}} : memref<1024x1024xf32>
// FILTER-NOT: memref.dealloc
// FILTER: return

// Check that only one memref is copied, because for-memref-region is enabled
// (and the first load encountered is the one analyzed).
// MEMREF_REGION: memref.alloc() : memref<1024x1024xf32>
// MEMREF_REGION-NOT: memref.alloc()
// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION: }
// MEMREF_REGION: }
// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION: memref.dealloc %{{.*}} : memref<1024x1024xf32>
// MEMREF_REGION-NOT: memref.dealloc
// MEMREF_REGION-NEXT: return

// -----

// This pattern typically appears with tiling when the tile sizes don't divide
// the loop trip counts.
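// Here, a 4096-iteration loop tiled by 100 leaves a partial last tile, so the
// intra-tile upper bound is min(4096, d0 + 100) and a 100-element buffer is
// enough for any tile.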

#map_ub = affine_map<(d0) -> (4096, d0 + 100)>

// CHECK-DAG: [[$MAP_IDENTITY:map[0-9]+]] = affine_map<(d0) -> (d0)>
// CHECK-DAG: [[$MAP_MIN_UB1:map[0-9]+]] = affine_map<(d0) -> (d0 + 100, 4096)>
// CHECK-DAG: [[$MAP_MIN_UB2:map[0-9]+]] = affine_map<(d0) -> (4096, d0 + 100)>

// CHECK-LABEL: func @min_upper_bound
func @min_upper_bound(%A: memref<4096xf32>) -> memref<4096xf32> {
  affine.for %i = 0 to 4096 step 100 {
    affine.for %ii = affine_map<(d0) -> (d0)>(%i) to min #map_ub(%i) {
      %5 = affine.load %A[%ii] : memref<4096xf32>
      %6 = mulf %5, %5 : f32
      affine.store %6, %A[%ii] : memref<4096xf32>
    }
  }
  return %A : memref<4096xf32>
}
// CHECK: affine.for %[[IV1:.*]] = 0 to 4096 step 100
// CHECK: %[[BUF:.*]] = memref.alloc() : memref<100xf32>
// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[$MAP_IDENTITY]](%[[IV1]]) to min #[[$MAP_MIN_UB1]](%[[IV1]]) {
// CHECK-NEXT: affine.load %{{.*}}[%[[IV2]]] : memref<4096xf32>
// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][%[[IV2]] - %[[IV1]]] : memref<100xf32>
// CHECK-NEXT: }
// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[$MAP_IDENTITY]](%[[IV1]]) to min #[[$MAP_MIN_UB2]](%[[IV1]]) {
// CHECK-NEXT: affine.load %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
// CHECK-NEXT: mulf
// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
// CHECK-NEXT: }
// CHECK: affine.for %[[IV2:.*]] = #[[$MAP_IDENTITY]](%[[IV1]]) to min #[[$MAP_MIN_UB1]](%[[IV1]]) {
// CHECK-NEXT: affine.load %[[BUF]][%[[IV2]] - %[[IV1]]] : memref<100xf32>
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%[[IV2]]] : memref<4096xf32>
// CHECK-NEXT: }
// CHECK-NEXT: memref.dealloc %[[BUF]] : memref<100xf32>
// CHECK-NEXT: }

// -----

// Lower bound is a max; upper bound is a min. This pattern typically appears
// with multi-level tiling when the tile sizes used don't divide loop trip
// counts.
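// The region accessed along the second dimension has a constant extent of 6
// (from s1 * 6 to s1 * 6 + 6), so a 2048x6 buffer indexed relative to the tile
// origin (%jj - symbol(%j) * 6) suffices.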

#lb = affine_map<()[s0, s1] -> (s0 * 512, s1 * 6)>
#ub = affine_map<()[s0, s1] -> (s0 * 512 + 512, s1 * 6 + 6)>

// CHECK-DAG: #[[$LB:.*]] = affine_map<()[s0, s1] -> (s0 * 512, s1 * 6)>
// CHECK-DAG: #[[$UB:.*]] = affine_map<()[s0, s1] -> (s0 * 512 + 512, s1 * 6 + 6)>

// CHECK-LABEL: max_lower_bound(%{{.*}}: memref<2048x516xf64>,
// CHECK-SAME: [[i:arg[0-9]+]]
// CHECK-SAME: [[j:arg[0-9]+]]
func @max_lower_bound(%M: memref<2048x516xf64>, %i : index, %j : index) {
  affine.for %ii = 0 to 2048 {
    affine.for %jj = max #lb()[%i, %j] to min #ub()[%i, %j] {
      affine.load %M[%ii, %jj] : memref<2048x516xf64>
    }
  }
  return
}

// CHECK: %[[BUF:.*]] = memref.alloc() : memref<2048x6xf64>
// CHECK-NEXT: affine.for %[[ii:.*]] = 0 to 2048 {
// CHECK-NEXT: affine.for %[[jj:.*]] = max #[[$LB]]()[%[[i]], %[[j]]] to min #[[$UB]]()[%[[i]], %[[j]]] {
// CHECK-NEXT: affine.load %{{.*}}[%[[ii]], %[[jj]]] : memref<2048x516xf64>
// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][%[[ii]], %[[jj]] - symbol(%[[j]]) * 6] : memref<2048x6xf64>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: affine.for %[[ii_:.*]] = 0 to 2048 {
// CHECK-NEXT: affine.for %[[jj_:.*]] = max #[[$LB]]()[%{{.*}}, %{{.*}}] to min #[[$UB]]()[%{{.*}}, %{{.*}}] {
// CHECK-NEXT: affine.load %[[BUF]][%[[ii_]], %[[jj_]] - symbol(%[[j]]) * 6] : memref<2048x6xf64>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: memref.dealloc %[[BUF]] : memref<2048x6xf64>

// -----

// CHECK-LABEL: func @empty_loops
func @empty_loops(%arg0: memref<1024x1024xf64>) {
  // Empty loops - so no copy generation happens.
  affine.for %i = 0 to 0 {
    affine.load %arg0[0, %i] : memref<1024x1024xf64>
  }
  affine.for %i = 0 to -16 {
    affine.load %arg0[0, %i] : memref<1024x1024xf64>
  }
  return
  // CHECK-NOT: memref.alloc
  // CHECK: return
}