; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s

; Checks GlobalISel AMDGPU code generation for a fixed 20-byte llvm.memcpy
; between two addrspace(1) pointers. The same input is compiled twice, with
; -amdgpu-mem-intrinsic-expand-size set just below (19) and just above (21)
; the copy length of 20:
;   * with the value 19, the expected output contains a %load-store-loop block
;     that moves 16 bytes per iteration, followed by a %memcpy-split block that
;     copies the remaining bytes at offsets 16 through 19;
;   * with the value 21, the expected output is 20 straight-line byte
;     load/store pairs at offsets 0 through 19, with no loop.
; The expected-output lines inside the function are generated by the script
; named on the first line of this file; regenerate them with that script
; instead of editing them by hand.

; Intrinsic under test: memcpy with addrspace(1) destination and source
; pointers, an i32 length, and an immarg i1 flag.
declare void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1)*, i8 addrspace(1)*, i32, i1 immarg)

; amdgpu_cs function that receives the destination and source pointers as its
; two arguments and forwards them to the intrinsic.
define amdgpu_cs void @memcpy_p1i8(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) {
; LOOP-LABEL: memcpy_p1i8:
; LOOP: ; %bb.0:
; LOOP-NEXT: s_mov_b32 s6, 0
; LOOP-NEXT: s_mov_b32 s7, 0xf000
; LOOP-NEXT: s_mov_b64 s[4:5], 0
; LOOP-NEXT: v_mov_b32_e32 v5, v3
; LOOP-NEXT: v_mov_b32_e32 v4, v2
; LOOP-NEXT: v_mov_b32_e32 v7, v1
; LOOP-NEXT: v_mov_b32_e32 v6, v0
; LOOP-NEXT: v_mov_b32_e32 v8, s6
; LOOP-NEXT: BB0_1: ; %load-store-loop
; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
; LOOP-NEXT: buffer_load_ubyte v9, v[4:5], s[4:7], 0 addr64
; LOOP-NEXT: buffer_load_ubyte v10, v[4:5], s[4:7], 0 addr64 offset:1
; LOOP-NEXT: buffer_load_ubyte v11, v[4:5], s[4:7], 0 addr64 offset:2
; LOOP-NEXT: buffer_load_ubyte v12, v[4:5], s[4:7], 0 addr64 offset:3
; LOOP-NEXT: buffer_load_ubyte v13, v[4:5], s[4:7], 0 addr64 offset:4
; LOOP-NEXT: buffer_load_ubyte v14, v[4:5], s[4:7], 0 addr64 offset:5
; LOOP-NEXT: buffer_load_ubyte v15, v[4:5], s[4:7], 0 addr64 offset:6
; LOOP-NEXT: buffer_load_ubyte v16, v[4:5], s[4:7], 0 addr64 offset:7
; LOOP-NEXT: buffer_load_ubyte v17, v[4:5], s[4:7], 0 addr64 offset:8
; LOOP-NEXT: s_waitcnt expcnt(6)
; LOOP-NEXT: buffer_load_ubyte v18, v[4:5], s[4:7], 0 addr64 offset:9
; LOOP-NEXT: s_waitcnt expcnt(5)
; LOOP-NEXT: buffer_load_ubyte v19, v[4:5], s[4:7], 0 addr64 offset:10
; LOOP-NEXT: s_waitcnt expcnt(4)
; LOOP-NEXT: buffer_load_ubyte v20, v[4:5], s[4:7], 0 addr64 offset:11
; LOOP-NEXT: s_waitcnt expcnt(3)
; LOOP-NEXT: buffer_load_ubyte v21, v[4:5], s[4:7], 0 addr64 offset:12
; LOOP-NEXT: s_waitcnt expcnt(2)
; LOOP-NEXT: buffer_load_ubyte v22, v[4:5], s[4:7], 0 addr64 offset:13
; LOOP-NEXT: s_waitcnt expcnt(1)
; LOOP-NEXT: buffer_load_ubyte v23, v[4:5], s[4:7], 0 addr64 offset:14
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: buffer_load_ubyte v24, v[4:5], s[4:7], 0 addr64 offset:15
; LOOP-NEXT: v_add_i32_e32 v8, vcc, 1, v8
; LOOP-NEXT: s_xor_b64 s[0:1], vcc, -1
; LOOP-NEXT: s_xor_b64 s[0:1], s[0:1], -1
; LOOP-NEXT: s_and_b64 vcc, s[0:1], exec
; LOOP-NEXT: s_waitcnt vmcnt(14)
; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[4:7], 0 addr64
; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[4:7], 0 addr64 offset:1
; LOOP-NEXT: s_waitcnt vmcnt(14)
; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[4:7], 0 addr64 offset:2
; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[4:7], 0 addr64 offset:3
; LOOP-NEXT: s_waitcnt vmcnt(14)
; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[4:7], 0 addr64 offset:4
; LOOP-NEXT: buffer_store_byte v14, v[6:7], s[4:7], 0 addr64 offset:5
; LOOP-NEXT: s_waitcnt vmcnt(14)
; LOOP-NEXT: buffer_store_byte v15, v[6:7], s[4:7], 0 addr64 offset:6
; LOOP-NEXT: buffer_store_byte v16, v[6:7], s[4:7], 0 addr64 offset:7
; LOOP-NEXT: s_waitcnt vmcnt(14)
; LOOP-NEXT: buffer_store_byte v17, v[6:7], s[4:7], 0 addr64 offset:8
; LOOP-NEXT: buffer_store_byte v18, v[6:7], s[4:7], 0 addr64 offset:9
; LOOP-NEXT: s_waitcnt vmcnt(14)
; LOOP-NEXT: buffer_store_byte v19, v[6:7], s[4:7], 0 addr64 offset:10
; LOOP-NEXT: buffer_store_byte v20, v[6:7], s[4:7], 0 addr64 offset:11
; LOOP-NEXT: s_waitcnt vmcnt(14)
; LOOP-NEXT: buffer_store_byte v21, v[6:7], s[4:7], 0 addr64 offset:12
; LOOP-NEXT: buffer_store_byte v22, v[6:7], s[4:7], 0 addr64 offset:13
; LOOP-NEXT: s_waitcnt vmcnt(14)
; LOOP-NEXT: buffer_store_byte v23, v[6:7], s[4:7], 0 addr64 offset:14
; LOOP-NEXT: buffer_store_byte v24, v[6:7], s[4:7], 0 addr64 offset:15
; LOOP-NEXT: v_add_i32_e64 v6, s[0:1], 16, v6
; LOOP-NEXT: v_addc_u32_e64 v7, s[0:1], 0, v7, s[0:1]
; LOOP-NEXT: v_add_i32_e64 v4, s[0:1], 16, v4
; LOOP-NEXT: v_addc_u32_e64 v5, s[0:1], 0, v5, s[0:1]
; LOOP-NEXT: s_cbranch_vccnz BB0_1
; LOOP-NEXT: ; %bb.2: ; %memcpy-split
; LOOP-NEXT: s_mov_b32 s2, 0
; LOOP-NEXT: s_mov_b32 s3, 0xf000
; LOOP-NEXT: s_mov_b64 s[0:1], 0
; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:16
; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:17
; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:18
; LOOP-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:19
; LOOP-NEXT: s_waitcnt vmcnt(3)
; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:16
; LOOP-NEXT: s_waitcnt vmcnt(3)
; LOOP-NEXT: buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:17
; LOOP-NEXT: s_waitcnt vmcnt(3)
; LOOP-NEXT: buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:18
; LOOP-NEXT: s_waitcnt vmcnt(3)
; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:19
; LOOP-NEXT: s_endpgm
;
; UNROLL-LABEL: memcpy_p1i8:
; UNROLL: ; %bb.0:
; UNROLL-NEXT: s_mov_b32 s2, 0
; UNROLL-NEXT: s_mov_b32 s3, 0xf000
; UNROLL-NEXT: s_mov_b64 s[0:1], 0
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:1
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:1
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:2
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:2
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:3
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:3
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:4
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:4
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:5
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:5
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:6
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:6
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:7
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:7
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:8
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:8
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:9
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:9
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:10
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:10
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:11
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:11
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:12
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:12
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:13
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:13
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:14
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:14
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:15
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:15
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:16
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:16
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:17
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:17
; UNROLL-NEXT: s_waitcnt expcnt(0)
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:18
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:18
; UNROLL-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:19
; UNROLL-NEXT: s_waitcnt vmcnt(0)
; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:19
; UNROLL-NEXT: s_endpgm
  ; Copy a constant 20 bytes from %src to %dst; the trailing i1 flag is false.
  call void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 20, i1 false)
  ret void
}