// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-pipeline-data-transfer | FileCheck %s

// -----

// CHECK-DAG: [[$MOD_2:#map[0-9]+]] = affine_map<(d0) -> (d0 mod 2)>
// CHECK-DAG: [[$MAP_MINUS_1:#map[0-9]+]] = affine_map<(d0) -> (d0 - 1)>

// CHECK-LABEL: func @loop_nest_dma() {
func @loop_nest_dma() {

  %A = alloc() : memref<256 x f32, affine_map<(d0) -> (d0)>, 0>
  %Ah = alloc() : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>

  %tag = alloc() : memref<1 x f32>

  %zero = constant 0 : index
  %num_elts = constant 32 : index

  affine.for %i = 0 to 8 {
    affine.dma_start %A[%i], %Ah[%i], %tag[%zero], %num_elts : memref<256 x f32>, memref<32 x f32, 1>, memref<1 x f32>
    affine.dma_wait %tag[%zero], %num_elts : memref<1 x f32>
    %v = affine.load %Ah[%i] : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
    %r = "compute"(%v) : (f32) -> (f32)
    affine.store %r, %Ah[%i] : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
    affine.for %j = 0 to 32 {
      "do_more_compute"(%i, %j) : (index, index) -> ()
    }
  }
  dealloc %tag : memref<1 x f32>
  dealloc %Ah : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
  return
}
// CHECK:       %{{.*}} = alloc() : memref<256xf32>
// CHECK:       %{{.*}} = alloc() : memref<2x32xf32, 1>
// CHECK-NEXT:  %{{.*}} = alloc() : memref<2x1xf32>
// CHECK-NEXT:  affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}} mod 2, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
// CHECK-NEXT:  affine.for %{{.*}} = 1 to 8 {
// CHECK-NEXT:    affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}} mod 2, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<256xf32>, memref<2x32xf32, 1>, memref<2x1xf32>
// CHECK-NEXT:    affine.apply [[$MAP_MINUS_1]](%{{.*}})
// CHECK-NEXT:    affine.apply [[$MOD_2]](%{{.*}})
// CHECK-NEXT:    affine.apply [[$MOD_2]](%{{.*}})
// CHECK-NEXT:    affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xf32>
// CHECK-NEXT:    affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
// CHECK-NEXT:    "compute"(%{{.*}}) : (f32) -> f32
// CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
// CHECK-NEXT:    affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT:      "do_more_compute"(%{{.*}}, %{{.*}}) : (index, index) -> ()
// CHECK-NEXT:    }
// CHECK-NEXT:  }
// CHECK-NEXT:  affine.apply [[$MAP_MINUS_1]](%{{.*}})
// CHECK-NEXT:  affine.apply [[$MOD_2]](%{{.*}})
// CHECK-NEXT:  affine.apply [[$MOD_2]](%{{.*}})
// CHECK-NEXT:  affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xf32>
// CHECK-NEXT:  affine.load %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
// CHECK-NEXT:  "compute"(%{{.*}}) : (f32) -> f32
// CHECK-NEXT:  affine.store %{{.*}}, %{{.*}}[%{{.*}} mod 2, %{{.*}}] : memref<2x32xf32, 1>
// CHECK-NEXT:  affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT:    "do_more_compute"(%{{.*}}, %{{.*}}) : (index, index) -> ()
// CHECK-NEXT:  }
// CHECK-NEXT:  dealloc %{{.*}} : memref<2x1xf32>
// CHECK-NEXT:  dealloc %{{.*}} : memref<2x32xf32, 1>
// CHECK-NEXT:  return
// CHECK-NEXT:}

// -----

// CHECK-DAG: [[$FLOOR_MOD_2:#map[0-9]+]] = affine_map<(d0) -> ((d0 floordiv 4) mod 2)>
// CHECK-DAG: [[$REMAP_SHIFT_MINUS_4:#map[0-9]+]] = affine_map<(d0) -> (d0 - 4)>

// CHECK-LABEL: @loop_step
func @loop_step(%arg0: memref<512xf32>,
                %arg1: memref<512xf32>) {
  %c0 = constant 0 : index
  %c4 = constant 4 : index
  affine.for %i0 = 0 to 512 step 4 {
    %1 = alloc() : memref<4xf32, 1>
    %2 = alloc() : memref<1xi32>
    affine.dma_start %arg0[%i0], %1[%c0], %2[%c0], %c4,
      : memref<512xf32>, memref<4xf32, 1>, memref<1xi32>
    affine.dma_wait %2[%c0], %c4 : memref<1xi32>
    "compute"(%i0) : (index) -> ()
    dealloc %2 : memref<1xi32>
    dealloc %1 : memref<4xf32, 1>
  }
  return
}
// CHECK:       [[BUF:%[0-9]+]] = alloc() : memref<2x4xf32, 1>
// CHECK:       [[TAG:%[0-9]+]] = alloc() : memref<2x1xi32>
// CHECK-NEXT:  affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[(%{{.*}} floordiv 4) mod 2, 0], [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
// CHECK-NEXT:  affine.for %{{.*}} = 4 to 512 step 4 {
// CHECK-NEXT:    affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[(%{{.*}} floordiv 4) mod 2, 0], [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<512xf32>, memref<2x4xf32, 1>, memref<2x1xi32>
// CHECK-NEXT:    affine.apply [[$REMAP_SHIFT_MINUS_4]](%{{.*}})
// CHECK-NEXT:    affine.apply [[$FLOOR_MOD_2]](%{{.*}})
// CHECK:         affine.dma_wait [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<2x1xi32>
// CHECK-NEXT:    "compute"(%{{.*}}) : (index) -> ()
// CHECK-NEXT:  }
// CHECK-NEXT:  [[SHIFTED:%[0-9]+]] = affine.apply [[$REMAP_SHIFT_MINUS_4]](%{{.*}})
// CHECK-NEXT:  %{{.*}} = affine.apply [[$FLOOR_MOD_2]]([[SHIFTED]])
// CHECK:       affine.dma_wait [[TAG]][(%{{.*}} floordiv 4) mod 2, 0], %{{.*}} : memref<2x1xi32>
// CHECK-NEXT:  "compute"(%{{.*}}) : (index) -> ()
// CHECK-NEXT:  dealloc [[TAG]] : memref<2x1xi32>
// CHECK-NEXT:  dealloc [[BUF]] : memref<2x4xf32, 1>
// CHECK-NEXT:  return
// CHECK-NEXT: }

// -----

#map1 = affine_map<(d0, d1) -> ((d0 * 2048 + d1 * 256) floordiv 32)>
#map2 = affine_map<(d0) -> ((d0 * 2048) floordiv 32)>
// CHECK-LABEL: func @loop_dma_nested(%{{.*}}: memref<512x32xvector<8xf32>
func @loop_dma_nested(%arg0: memref<512x32xvector<8xf32>>, %arg1: memref<512x32xvector<8xf32>>, %arg2: memref<512x32xvector<8xf32>>) {
  %num_elts = constant 256 : index
  %c0 = constant 0 : index
  %0 = alloc() : memref<64x4xvector<8xf32>, 2>
  %1 = alloc() : memref<64x4xvector<8xf32>, 2>
  %2 = alloc() : memref<64x4xvector<8xf32>, 2>
  %3 = alloc() : memref<2xi32>
  %4 = alloc() : memref<2xi32>
  %5 = alloc() : memref<2xi32>
  // Prologue for DMA overlap on arg2.
  // CHECK-DAG: [[BUF_ARG2:%[0-9]+]] = alloc() : memref<2x64x4xvector<8xf32>, 2>
  // CHECK-DAG: [[TAG_ARG2:%[0-9]+]] = alloc() : memref<2x2xi32>
  // CHECK: affine.dma_start %{{.*}}[
  // CHECK: affine.for %{{.*}} = 1 to 8 {
  affine.for %i0 = 0 to 8 {
    %6 = affine.apply #map2(%i0)
    affine.dma_start %arg2[%6, %c0], %2[%c0, %c0], %5[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
    affine.dma_wait %5[%c0], %num_elts : memref<2xi32>
    // Steady state for DMA overlap on arg2
    // CHECK: affine.dma_start %{{.*}}[
    // CHECK: affine.dma_wait [[TAG_ARG2]]
    // Prologue for DMA overlap on arg0, arg1 nested within i0
    // CHECK: [[BUF_ARG0:%[0-9]+]] = alloc() : memref<2x64x4xvector<8xf32>, 2>
    // CHECK: [[BUF_ARG1:%[0-9]+]] = alloc() : memref<2x64x4xvector<8xf32>, 2>
    // CHECK: [[TAG_ARG0:%[0-9]+]] = alloc() : memref<2x2xi32>
    // CHECK: [[TAG_ARG1:%[0-9]+]] = alloc() : memref<2x2xi32>
    // CHECK: affine.dma_start %{{.*}}[
    // CHECK: affine.dma_start %{{.*}}[
    // CHECK-NEXT: affine.for %{{.*}} = 1 to 8 {
    affine.for %i1 = 0 to 8 {
      %7 = affine.apply #map1(%i0, %i1)
      %8 = affine.apply #map2(%i1)
      affine.dma_start %arg0[%7, %c0], %0[%c0, %c0], %3[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
      affine.dma_start %arg1[%8, %c0], %1[%c0, %c0], %4[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
      affine.dma_wait %3[%c0], %num_elts : memref<2xi32>
      affine.dma_wait %4[%c0], %num_elts : memref<2xi32>
      // Steady state for DMA overlap on arg0, arg1
      // CHECK: affine.dma_start %{{.*}}[
      // CHECK: affine.dma_start %{{.*}}[
      // CHECK: affine.dma_wait [[TAG_ARG0]]
      // CHECK: affine.dma_wait [[TAG_ARG1]]
      // CHECK-NEXT: affine.for %{{.*}} = 0 to 4 {
      affine.for %i2 = 0 to 4 {
        "foo"() : () -> ()
      }
    }
    // epilogue for arg0, arg1
    // CHECK: affine.dma_wait [[TAG_ARG0]]
    // CHECK: affine.dma_wait [[TAG_ARG1]]
    // CHECK-DAG:    dealloc [[TAG_ARG1]] : memref<2x2xi32>
    // CHECK-DAG:    dealloc [[TAG_ARG0]] : memref<2x2xi32>
    // CHECK-DAG:    dealloc [[BUF_ARG1]] : memref<2x64x4xvector<8xf32>, 2>
    // CHECK-DAG:    dealloc [[BUF_ARG0]] : memref<2x64x4xvector<8xf32>, 2>
    // epilogue for DMA overlap on %arg2
    // CHECK:  affine.dma_wait [[TAG_ARG2]]
    // Within the epilogue for arg2's DMA, we have the DMAs on %arg1, %arg2 nested.
    // CHECK: [[BUF_ARG0_NESTED:%[0-9]+]] = alloc() : memref<2x64x4xvector<8xf32>, 2>
    // CHECK: [[BUF_ARG1_NESTED:%[0-9]+]] = alloc() : memref<2x64x4xvector<8xf32>, 2>
    // CHECK: [[TAG_ARG0_NESTED:%[0-9]+]] = alloc() : memref<2x2xi32>
    // CHECK: [[TAG_ARG1_NESTED:%[0-9]+]] = alloc() : memref<2x2xi32>
    // CHECK:  affine.dma_start %{{.*}}[
    // CHECK:  affine.dma_start %{{.*}}[
    // CHECK:  affine.for %{{.*}} = 1 to 8 {
    // CHECK:    affine.dma_start %{{.*}}[
    // CHECK:    affine.dma_start %{{.*}}[
    // CHECK:    affine.dma_wait [[TAG_ARG0_NESTED]]
    // CHECK:    affine.dma_wait [[TAG_ARG1_NESTED]]
    // CHECK:    affine.for %{{.*}} = 0 to 4 {
    // CHECK:      "foo"() : () -> ()
    // CHECK:  affine.dma_wait [[TAG_ARG0_NESTED]]
    // CHECK:  affine.dma_wait [[TAG_ARG1_NESTED]]
    // CHECK:  affine.for %{{.*}} = 0 to 4 {
  }
  dealloc %5 : memref<2xi32>
  dealloc %4 : memref<2xi32>
  dealloc %3 : memref<2xi32>
  dealloc %2 : memref<64x4xvector<8xf32>, 2>
  dealloc %1 : memref<64x4xvector<8xf32>, 2>
  dealloc %0 : memref<64x4xvector<8xf32>, 2>
  return
// CHECK: }
// CHECK-DAG:  dealloc [[TAG_ARG1_NESTED]] : memref<2x2xi32>
// CHECK-DAG:  dealloc [[TAG_ARG0_NESTED]] : memref<2x2xi32>
// CHECK-DAG:  dealloc [[BUF_ARG1_NESTED]] : memref<2x64x4xvector<8xf32>, 2>
// CHECK-DAG:  dealloc [[BUF_ARG0_NESTED]] : memref<2x64x4xvector<8xf32>, 2>
// CHECK-DAG:  dealloc [[TAG_ARG2]] : memref<2x2xi32>
// CHECK-DAG:  dealloc [[BUF_ARG2]] : memref<2x64x4xvector<8xf32>, 2>
// CHECK-NEXT: return
}

// -----
#map2 = affine_map<(d0) -> ((d0 * 2048) floordiv 32)>

// CHECK: func @loop_dma_dependent
func @loop_dma_dependent(%arg2: memref<512x32xvector<8xf32>>) {
  %num_elts = constant 256 : index
  %c0 = constant 0 : index
  %0 = alloc() : memref<64x4xvector<8xf32>, 2>
  %1 = alloc() : memref<64x4xvector<8xf32>, 2>
  %2 = alloc() : memref<64x4xvector<8xf32>, 2>
  %3 = alloc() : memref<2xi32>
  %4 = alloc() : memref<2xi32>
  %5 = alloc() : memref<2xi32>

  // The two DMAs below are dependent (incoming and outgoing on the same
  // memref) in the same iteration; so no pipelining here.
  // CHECK-NOT: affine.dma_start
  // CHECK: affine.for %{{.*}} = 0 to 8 {
  affine.for %i0 = 0 to 8 {
    %6 = affine.apply #map2(%i0)
    affine.dma_start %arg2[%6, %c0], %2[%c0, %c0], %5[%c0], %num_elts : memref<512x32xvector<8xf32>>, memref<64x4xvector<8xf32>, 2>, memref<2xi32>
    affine.dma_wait %5[%c0], %num_elts : memref<2xi32>

    affine.dma_start %2[%c0, %c0], %arg2[%6, %c0], %5[%c0], %num_elts : memref<64x4xvector<8xf32>, 2>, memref<512x32xvector<8xf32>>, memref<2xi32>
    affine.dma_wait %5[%c0], %num_elts : memref<2xi32>
  }
  dealloc %5 : memref<2xi32>
  dealloc %4 : memref<2xi32>
  dealloc %3 : memref<2xi32>
  dealloc %2 : memref<64x4xvector<8xf32>, 2>
  dealloc %1 : memref<64x4xvector<8xf32>, 2>
  dealloc %0 : memref<64x4xvector<8xf32>, 2>
  return
}

// -----

// CHECK-LABEL: func @escaping_use
func @escaping_use(%arg0: memref<512 x 32 x f32>) {
  %c32 = constant 32 : index
  %num_elt = constant 512 : index
  %zero = constant 0 : index
  %Av = alloc() : memref<32 x 32 x f32, 2>
  %tag = alloc() : memref<1 x i32>

  // CHECK-NOT: affine.dma_start
  // CHECK: affine.for %{{.*}} = 0 to 16 {
  affine.for %kTT = 0 to 16 {
    affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
      memref<512 x 32 x f32>,
      memref<32 x 32 x f32, 2>, memref<1 x i32>
    affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
    // escaping use; no DMA pipelining / double buffering will be done.
    "foo"(%Av) : (memref<32 x 32 x f32, 2>) -> ()
  }
  dealloc %tag : memref<1 x i32>
  dealloc %Av : memref<32 x 32 x f32, 2>
  return
// CHECK:        "foo"(%{{[0-9]+}}) : (memref<32x32xf32, 2>) -> ()
// CHECK:      }
// CHECK:      return
}

// -----

// CHECK-LABEL: func @escaping_tag
func @escaping_tag(%arg0: memref<512 x 32 x f32>) {
  %c32 = constant 32 : index
  %num_elt = constant 512 : index
  %zero = constant 0 : index
  %Av = alloc() : memref<32 x 32 x f32, 2>
  %tag = alloc() : memref<1 x i32>

  // CHECK-NOT: affine.dma_start
  // CHECK: affine.for %{{.*}} = 0 to 16 {
  affine.for %kTT = 0 to 16 {
    affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
      memref<512 x 32 x f32>,
      memref<32 x 32 x f32, 2>, memref<1 x i32>
    affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
    // escaping use; no DMA pipelining / double buffering will be done.
    "foo"(%tag) : (memref<1 x i32>) -> ()
  }
  dealloc %tag : memref<1 x i32>
  dealloc %Av : memref<32 x 32 x f32, 2>
  return
// CHECK:        "foo"(%{{[0-9]+}}) : (memref<1xi32>) -> ()
// CHECK:      }
// CHECK:      return
}


// -----

// CHECK-LABEL: func @live_out_use
func @live_out_use(%arg0: memref<512 x 32 x f32>) -> f32 {
  %c32 = constant 32 : index
  %num_elt = constant 512 : index
  %zero = constant 0 : index
  %Av = alloc() : memref<32 x 32 x f32, 2>
  %tag = alloc() : memref<1 x i32>

  // CHECK-NOT: affine.dma_start
  // CHECK: affine.for %{{.*}} = 0 to 16 {
  affine.for %kTT = 0 to 16 {
    affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
      memref<512 x 32 x f32>,
      memref<32 x 32 x f32, 2>, memref<1 x i32>
    affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
  }
  // Use live out of 'affine.for' op; no DMA pipelining will be done.
  %v = affine.load %Av[%zero, %zero] : memref<32 x 32 x f32, 2>
  dealloc %tag : memref<1 x i32>
  dealloc %Av : memref<32 x 32 x f32, 2>
  return %v : f32
// CHECK:      affine.load %{{[0-9]+}}[%{{.*}}, %{{.*}}] : memref<32x32xf32, 2>
// CHECK:      return
}

// -----

// CHECK-LABEL: func @dynamic_shape_dma_buffer
func @dynamic_shape_dma_buffer(%arg0: memref<512 x 32 x f32>) {
  %c32 = constant 32 : index
  %num_elt = constant 512 : index
  %zero = constant 0 : index

  %Av = alloc(%c32, %c32) : memref<? x ? x f32, 2>
  %tag = alloc() : memref<1 x i32>

// Double buffering for dynamic shaped buffer.
// CHECK:       alloc(%{{.*}}, %{{.*}}) : memref<?x?xf32, 2>
// CHECK-NEXT:  %[[C0:.*]] = constant 0 : index
// CHECK-NEXT:  dim %{{.*}}, %[[C0]] : memref<?x?xf32, 2>
// CHECK-NEXT:  %[[C1:.*]] = constant 1 : index
// CHECK-NEXT:  dim %{{.*}}, %[[C1]] : memref<?x?xf32, 2>
// CHECK-NEXT:  alloc(%{{.*}}, %{{.*}}) : memref<2x?x?xf32, 2>
// CHECK:       affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0, 0], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}}
  affine.for %kTT = 0 to 16 {
    affine.dma_start %arg0[%zero, %zero], %Av[%zero, %zero], %tag[%zero], %num_elt :
      memref<512 x 32 x f32>,
      memref<? x ? x f32, 2>, memref<1 x i32>
    affine.dma_wait %tag[%zero], %num_elt : memref<1 x i32>
  }
  dealloc %Av : memref<? x ? x f32, 2>
  return
// CHECK-NEXT:  affine.for %{{.*}} = 1 to 16 {
// CHECK:         affine.dma_start %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}} mod 2, 0, 0], %{{.*}}[%{{.*}} mod 2, 0], %{{.*}}
// CHECK:         affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xi32>
// CHECK:       }
// CHECK:       affine.dma_wait %{{.*}}[%{{.*}} mod 2, 0], %{{.*}} : memref<2x1xi32>
// CHECK:       return
}

// Memref replacement will fail here due to a non-dereferencing use. However,
// no incorrect transformation is performed in spite of one of the uses being a
// dereferencing one since replaceAllMemRefUsesWith checks for escaping uses
// before performing any replacement.
// CHECK-LABEL: func @escaping_and_indexed_use_mix
func @escaping_and_indexed_use_mix() {
  %A = alloc() : memref<256 x f32, affine_map<(d0) -> (d0)>, 0>
  %Ah = alloc() : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
  %tag = alloc() : memref<1 x f32>
  %zero = constant 0 : index
  %num_elts = constant 32 : index

  // alloc for the buffer is created but no replacement should happen.
  affine.for %i = 0 to 8 {
    affine.dma_start %A[%i], %Ah[%i], %tag[%zero], %num_elts : memref<256 x f32>, memref<32 x f32, 1>, memref<1 x f32>
    affine.dma_wait %tag[%zero], %num_elts : memref<1 x f32>
    "compute"(%Ah) : (memref<32 x f32, 1>) -> ()
    %v = affine.load %Ah[%i] : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
    "foo"(%v) : (f32) -> ()
  }
  dealloc %A : memref<256 x f32, affine_map<(d0) -> (d0)>, 0>
  dealloc %Ah : memref<32 x f32, affine_map<(d0) -> (d0)>, 1>
  return
}
// No replacement.
// CHECK:      affine.for %{{.*}} = 0 to 8 {
// CHECK-NEXT:   affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}
// CHECK-NEXT:   affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xf32>
// CHECK-NEXT:   "compute"(%{{.*}}) : (memref<32xf32, 1>) -> ()
// CHECK-NEXT:   [[VAL:%[0-9]+]] = affine.load %{{.*}}[%{{.*}}] : memref<32xf32, 1>
// CHECK-NEXT:   "foo"([[VAL]]) : (f32) -> ()