1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s
3; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s
4
; Declaration of the memcpy intrinsic overload exercised by this test: both
; pointer operands are i8 pointers in addrspace(1), the length operand is an
; i32, and the trailing i1 operand must be an immediate (immarg).
5declare void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1)*, i8 addrspace(1)*, i32, i1 immarg)
6
; Regression test for lowering a constant 20-byte llvm.memcpy between two
; addrspace(1) pointers inside an amdgpu_cs function.
;
; The two llc invocations at the top of the file differ only in the value of
; -amdgpu-mem-intrinsic-expand-size, which is set just below and just above
; the 20-byte length used here:
;   * 19, checked with the LOOP prefix -- the expected output contains a
;     %load-store-loop block that moves 16 bytes per iteration (offsets 0..15,
;     then both pointers advance by 16), followed by a %memcpy-split block
;     that copies the remaining bytes at offsets 16..19.
;   * 21, checked with the UNROLL prefix -- the expected output is
;     straight-line code with one byte load/store pair per offset 0..19 and
;     no branch.
;
; The assertion lines inside this function are autogenerated (see the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
7define amdgpu_cs void @memcpy_p1i8(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) {
8; LOOP-LABEL: memcpy_p1i8:
9; LOOP:       ; %bb.0:
10; LOOP-NEXT:    s_mov_b32 s6, 0
11; LOOP-NEXT:    s_mov_b32 s7, 0xf000
12; LOOP-NEXT:    s_mov_b64 s[4:5], 0
13; LOOP-NEXT:    v_mov_b32_e32 v5, v3
14; LOOP-NEXT:    v_mov_b32_e32 v4, v2
15; LOOP-NEXT:    v_mov_b32_e32 v7, v1
16; LOOP-NEXT:    v_mov_b32_e32 v6, v0
17; LOOP-NEXT:    v_mov_b32_e32 v8, s6
18; LOOP-NEXT:  BB0_1: ; %load-store-loop
19; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
20; LOOP-NEXT:    buffer_load_ubyte v9, v[4:5], s[4:7], 0 addr64
21; LOOP-NEXT:    buffer_load_ubyte v10, v[4:5], s[4:7], 0 addr64 offset:1
22; LOOP-NEXT:    buffer_load_ubyte v11, v[4:5], s[4:7], 0 addr64 offset:2
23; LOOP-NEXT:    buffer_load_ubyte v12, v[4:5], s[4:7], 0 addr64 offset:3
24; LOOP-NEXT:    buffer_load_ubyte v13, v[4:5], s[4:7], 0 addr64 offset:4
25; LOOP-NEXT:    buffer_load_ubyte v14, v[4:5], s[4:7], 0 addr64 offset:5
26; LOOP-NEXT:    buffer_load_ubyte v15, v[4:5], s[4:7], 0 addr64 offset:6
27; LOOP-NEXT:    buffer_load_ubyte v16, v[4:5], s[4:7], 0 addr64 offset:7
28; LOOP-NEXT:    buffer_load_ubyte v17, v[4:5], s[4:7], 0 addr64 offset:8
29; LOOP-NEXT:    s_waitcnt expcnt(6)
30; LOOP-NEXT:    buffer_load_ubyte v18, v[4:5], s[4:7], 0 addr64 offset:9
31; LOOP-NEXT:    s_waitcnt expcnt(5)
32; LOOP-NEXT:    buffer_load_ubyte v19, v[4:5], s[4:7], 0 addr64 offset:10
33; LOOP-NEXT:    s_waitcnt expcnt(4)
34; LOOP-NEXT:    buffer_load_ubyte v20, v[4:5], s[4:7], 0 addr64 offset:11
35; LOOP-NEXT:    s_waitcnt expcnt(3)
36; LOOP-NEXT:    buffer_load_ubyte v21, v[4:5], s[4:7], 0 addr64 offset:12
37; LOOP-NEXT:    s_waitcnt expcnt(2)
38; LOOP-NEXT:    buffer_load_ubyte v22, v[4:5], s[4:7], 0 addr64 offset:13
39; LOOP-NEXT:    s_waitcnt expcnt(1)
40; LOOP-NEXT:    buffer_load_ubyte v23, v[4:5], s[4:7], 0 addr64 offset:14
41; LOOP-NEXT:    s_waitcnt expcnt(0)
42; LOOP-NEXT:    buffer_load_ubyte v24, v[4:5], s[4:7], 0 addr64 offset:15
43; LOOP-NEXT:    v_add_i32_e32 v8, vcc, 1, v8
44; LOOP-NEXT:    s_xor_b64 s[0:1], vcc, -1
45; LOOP-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
46; LOOP-NEXT:    s_and_b64 vcc, s[0:1], exec
47; LOOP-NEXT:    s_waitcnt vmcnt(14)
48; LOOP-NEXT:    buffer_store_byte v9, v[6:7], s[4:7], 0 addr64
49; LOOP-NEXT:    buffer_store_byte v10, v[6:7], s[4:7], 0 addr64 offset:1
50; LOOP-NEXT:    s_waitcnt vmcnt(14)
51; LOOP-NEXT:    buffer_store_byte v11, v[6:7], s[4:7], 0 addr64 offset:2
52; LOOP-NEXT:    buffer_store_byte v12, v[6:7], s[4:7], 0 addr64 offset:3
53; LOOP-NEXT:    s_waitcnt vmcnt(14)
54; LOOP-NEXT:    buffer_store_byte v13, v[6:7], s[4:7], 0 addr64 offset:4
55; LOOP-NEXT:    buffer_store_byte v14, v[6:7], s[4:7], 0 addr64 offset:5
56; LOOP-NEXT:    s_waitcnt vmcnt(14)
57; LOOP-NEXT:    buffer_store_byte v15, v[6:7], s[4:7], 0 addr64 offset:6
58; LOOP-NEXT:    buffer_store_byte v16, v[6:7], s[4:7], 0 addr64 offset:7
59; LOOP-NEXT:    s_waitcnt vmcnt(14)
60; LOOP-NEXT:    buffer_store_byte v17, v[6:7], s[4:7], 0 addr64 offset:8
61; LOOP-NEXT:    buffer_store_byte v18, v[6:7], s[4:7], 0 addr64 offset:9
62; LOOP-NEXT:    s_waitcnt vmcnt(14)
63; LOOP-NEXT:    buffer_store_byte v19, v[6:7], s[4:7], 0 addr64 offset:10
64; LOOP-NEXT:    buffer_store_byte v20, v[6:7], s[4:7], 0 addr64 offset:11
65; LOOP-NEXT:    s_waitcnt vmcnt(14)
66; LOOP-NEXT:    buffer_store_byte v21, v[6:7], s[4:7], 0 addr64 offset:12
67; LOOP-NEXT:    buffer_store_byte v22, v[6:7], s[4:7], 0 addr64 offset:13
68; LOOP-NEXT:    s_waitcnt vmcnt(14)
69; LOOP-NEXT:    buffer_store_byte v23, v[6:7], s[4:7], 0 addr64 offset:14
70; LOOP-NEXT:    buffer_store_byte v24, v[6:7], s[4:7], 0 addr64 offset:15
71; LOOP-NEXT:    v_add_i32_e64 v6, s[0:1], 16, v6
72; LOOP-NEXT:    v_addc_u32_e64 v7, s[0:1], 0, v7, s[0:1]
73; LOOP-NEXT:    v_add_i32_e64 v4, s[0:1], 16, v4
74; LOOP-NEXT:    v_addc_u32_e64 v5, s[0:1], 0, v5, s[0:1]
75; LOOP-NEXT:    s_cbranch_vccnz BB0_1
76; LOOP-NEXT:  ; %bb.2: ; %memcpy-split
77; LOOP-NEXT:    s_mov_b32 s2, 0
78; LOOP-NEXT:    s_mov_b32 s3, 0xf000
79; LOOP-NEXT:    s_mov_b64 s[0:1], 0
80; LOOP-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:16
81; LOOP-NEXT:    buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:17
82; LOOP-NEXT:    buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:18
83; LOOP-NEXT:    buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:19
84; LOOP-NEXT:    s_waitcnt vmcnt(3)
85; LOOP-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:16
86; LOOP-NEXT:    s_waitcnt vmcnt(3)
87; LOOP-NEXT:    buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:17
88; LOOP-NEXT:    s_waitcnt vmcnt(3)
89; LOOP-NEXT:    buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:18
90; LOOP-NEXT:    s_waitcnt vmcnt(3)
91; LOOP-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:19
92; LOOP-NEXT:    s_endpgm
93;
94; UNROLL-LABEL: memcpy_p1i8:
95; UNROLL:       ; %bb.0:
96; UNROLL-NEXT:    s_mov_b32 s2, 0
97; UNROLL-NEXT:    s_mov_b32 s3, 0xf000
98; UNROLL-NEXT:    s_mov_b64 s[0:1], 0
99; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64
100; UNROLL-NEXT:    s_waitcnt vmcnt(0)
101; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64
102; UNROLL-NEXT:    s_waitcnt expcnt(0)
103; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:1
104; UNROLL-NEXT:    s_waitcnt vmcnt(0)
105; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:1
106; UNROLL-NEXT:    s_waitcnt expcnt(0)
107; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:2
108; UNROLL-NEXT:    s_waitcnt vmcnt(0)
109; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:2
110; UNROLL-NEXT:    s_waitcnt expcnt(0)
111; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:3
112; UNROLL-NEXT:    s_waitcnt vmcnt(0)
113; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:3
114; UNROLL-NEXT:    s_waitcnt expcnt(0)
115; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:4
116; UNROLL-NEXT:    s_waitcnt vmcnt(0)
117; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:4
118; UNROLL-NEXT:    s_waitcnt expcnt(0)
119; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:5
120; UNROLL-NEXT:    s_waitcnt vmcnt(0)
121; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:5
122; UNROLL-NEXT:    s_waitcnt expcnt(0)
123; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:6
124; UNROLL-NEXT:    s_waitcnt vmcnt(0)
125; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:6
126; UNROLL-NEXT:    s_waitcnt expcnt(0)
127; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:7
128; UNROLL-NEXT:    s_waitcnt vmcnt(0)
129; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:7
130; UNROLL-NEXT:    s_waitcnt expcnt(0)
131; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:8
132; UNROLL-NEXT:    s_waitcnt vmcnt(0)
133; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:8
134; UNROLL-NEXT:    s_waitcnt expcnt(0)
135; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:9
136; UNROLL-NEXT:    s_waitcnt vmcnt(0)
137; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:9
138; UNROLL-NEXT:    s_waitcnt expcnt(0)
139; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:10
140; UNROLL-NEXT:    s_waitcnt vmcnt(0)
141; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:10
142; UNROLL-NEXT:    s_waitcnt expcnt(0)
143; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:11
144; UNROLL-NEXT:    s_waitcnt vmcnt(0)
145; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:11
146; UNROLL-NEXT:    s_waitcnt expcnt(0)
147; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:12
148; UNROLL-NEXT:    s_waitcnt vmcnt(0)
149; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:12
150; UNROLL-NEXT:    s_waitcnt expcnt(0)
151; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:13
152; UNROLL-NEXT:    s_waitcnt vmcnt(0)
153; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:13
154; UNROLL-NEXT:    s_waitcnt expcnt(0)
155; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:14
156; UNROLL-NEXT:    s_waitcnt vmcnt(0)
157; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:14
158; UNROLL-NEXT:    s_waitcnt expcnt(0)
159; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:15
160; UNROLL-NEXT:    s_waitcnt vmcnt(0)
161; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:15
162; UNROLL-NEXT:    s_waitcnt expcnt(0)
163; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:16
164; UNROLL-NEXT:    s_waitcnt vmcnt(0)
165; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:16
166; UNROLL-NEXT:    s_waitcnt expcnt(0)
167; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:17
168; UNROLL-NEXT:    s_waitcnt vmcnt(0)
169; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:17
170; UNROLL-NEXT:    s_waitcnt expcnt(0)
171; UNROLL-NEXT:    buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:18
172; UNROLL-NEXT:    s_waitcnt vmcnt(0)
173; UNROLL-NEXT:    buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:18
174; UNROLL-NEXT:    buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:19
175; UNROLL-NEXT:    s_waitcnt vmcnt(0)
176; UNROLL-NEXT:    buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:19
177; UNROLL-NEXT:    s_endpgm
  ; The length operand is the constant 20. The trailing "i1 false" is the
  ; intrinsic's volatile flag (per the LLVM LangRef), so this is a
  ; non-volatile copy from %src to %dst.
178  call void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 20, i1 false)
179  ret void
180}
181
182