1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
8
; udot8_acc32: unsigned dot product of two <8 x i4> vectors with an i32
; accumulator. Each work-item loads one <8 x i4> from %src1 and one from %src2
; (indexed by the work-item id), zero-extends every lane to i32, multiplies the
; lanes pairwise and adds the eight products to the i32 read from %dst, then
; writes the sum back to %dst.
; The checks below expect a single v_dot8_u32_u4 on gfx906 and on
; gfx1011/gfx1012, and a per-lane bfe + mad (or mul + add3) expansion on
; gfx700, gfx803 and gfx900.
9define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
10; GFX7-LABEL: udot8_acc32:
11; GFX7:       ; %bb.0: ; %entry
12; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
13; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
14; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
15; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
16; GFX7-NEXT:    s_mov_b32 s14, -1
17; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
18; GFX7-NEXT:    s_add_u32 s12, s12, s3
19; GFX7-NEXT:    s_mov_b32 s3, 0xf000
20; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
22; GFX7-NEXT:    s_mov_b32 s10, 0
23; GFX7-NEXT:    s_mov_b32 s11, s3
24; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
25; GFX7-NEXT:    v_mov_b32_e32 v1, 0
26; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
27; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
28; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
29; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
30; GFX7-NEXT:    s_mov_b32 s2, -1
31; GFX7-NEXT:    s_addc_u32 s13, s13, 0
32; GFX7-NEXT:    s_waitcnt vmcnt(1)
33; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
34; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
35; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
36; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
37; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
38; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
39; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
40; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
41; GFX7-NEXT:    s_waitcnt vmcnt(0)
42; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
43; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
44; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
45; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
46; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
47; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
48; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
49; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
50; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
51; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, s4
52; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
53; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
54; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
55; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
56; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
57; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
58; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v9, v0
59; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
60; GFX7-NEXT:    s_endpgm
61;
62; GFX8-LABEL: udot8_acc32:
63; GFX8:       ; %bb.0: ; %entry
64; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
65; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
66; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
67; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
68; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
69; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
70; GFX8-NEXT:    v_mov_b32_e32 v1, s5
71; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
72; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
73; GFX8-NEXT:    v_mov_b32_e32 v3, s7
74; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v2
75; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
76; GFX8-NEXT:    flat_load_dword v0, v[0:1]
77; GFX8-NEXT:    flat_load_dword v1, v[2:3]
78; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
79; GFX8-NEXT:    s_mov_b32 s10, -1
80; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
81; GFX8-NEXT:    s_add_u32 s8, s8, s3
82; GFX8-NEXT:    s_addc_u32 s9, s9, 0
83; GFX8-NEXT:    s_waitcnt vmcnt(1)
84; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 28, v0
85; GFX8-NEXT:    v_bfe_u32 v3, v0, 24, 4
86; GFX8-NEXT:    v_bfe_u32 v4, v0, 20, 4
87; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 4
88; GFX8-NEXT:    v_bfe_u32 v6, v0, 12, 4
89; GFX8-NEXT:    v_bfe_u32 v7, v0, 8, 4
90; GFX8-NEXT:    v_bfe_u32 v8, v0, 4, 4
91; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
92; GFX8-NEXT:    s_waitcnt vmcnt(0)
93; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v1
94; GFX8-NEXT:    v_bfe_u32 v10, v1, 24, 4
95; GFX8-NEXT:    v_bfe_u32 v11, v1, 20, 4
96; GFX8-NEXT:    v_bfe_u32 v12, v1, 16, 4
97; GFX8-NEXT:    v_bfe_u32 v13, v1, 12, 4
98; GFX8-NEXT:    v_bfe_u32 v14, v1, 8, 4
99; GFX8-NEXT:    v_bfe_u32 v15, v1, 4, 4
100; GFX8-NEXT:    v_and_b32_e32 v1, 15, v1
101; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
102; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v1, s2
103; GFX8-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
104; GFX8-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
105; GFX8-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
106; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
107; GFX8-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
108; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
109; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v9, v0
110; GFX8-NEXT:    v_mov_b32_e32 v0, s0
111; GFX8-NEXT:    v_mov_b32_e32 v1, s1
112; GFX8-NEXT:    flat_store_dword v[0:1], v2
113; GFX8-NEXT:    s_endpgm
114;
115; GFX9-LABEL: udot8_acc32:
116; GFX9:       ; %bb.0: ; %entry
117; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
118; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
119; GFX9-NEXT:    s_mov_b32 s10, -1
120; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
121; GFX9-NEXT:    s_add_u32 s8, s8, s3
122; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
123; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
124; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
125; GFX9-NEXT:    s_addc_u32 s9, s9, 0
126; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
128; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
129; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
130; GFX9-NEXT:    v_mov_b32_e32 v0, 0
131; GFX9-NEXT:    s_waitcnt vmcnt(1)
132; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
133; GFX9-NEXT:    s_waitcnt vmcnt(0)
134; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
135; GFX9-NEXT:    v_bfe_u32 v4, v1, 24, 4
136; GFX9-NEXT:    v_bfe_u32 v11, v2, 24, 4
137; GFX9-NEXT:    v_bfe_u32 v5, v1, 20, 4
138; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
139; GFX9-NEXT:    v_bfe_u32 v6, v1, 16, 4
140; GFX9-NEXT:    v_bfe_u32 v13, v2, 16, 4
141; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
142; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
143; GFX9-NEXT:    v_bfe_u32 v8, v1, 8, 4
144; GFX9-NEXT:    v_bfe_u32 v9, v1, 4, 4
145; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
146; GFX9-NEXT:    v_bfe_u32 v16, v2, 4, 4
147; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
148; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
149; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
150; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v9, v16
151; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
152; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
153; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
154; GFX9-NEXT:    v_add3_u32 v1, v1, s0, v2
155; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
156; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
157; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
158; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
159; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v3, v10
160; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v5
161; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
162; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
163; GFX9-NEXT:    s_endpgm
164;
165; GFX9-DL-LABEL: udot8_acc32:
166; GFX9-DL:       ; %bb.0: ; %entry
167; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
168; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
169; GFX9-DL-NEXT:    s_mov_b32 s10, -1
170; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
171; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
172; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
173; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
174; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
175; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
176; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
177; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
178; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
179; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
180; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
181; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
182; GFX9-DL-NEXT:    v_dot8_u32_u4 v0, v2, v3, s0
183; GFX9-DL-NEXT:    global_store_dword v1, v0, s[2:3]
184; GFX9-DL-NEXT:    s_endpgm
185;
186; GFX10-DL-LABEL: udot8_acc32:
187; GFX10-DL:       ; %bb.0: ; %entry
188; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
189; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
190; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
191; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
192; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
193; GFX10-DL-NEXT:    s_mov_b32 s10, -1
194; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
195; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
196; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
197; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
198; GFX10-DL-NEXT:    s_clause 0x1
199; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
200; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
201; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
202; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
203; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
204; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s2
205; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
206; GFX10-DL-NEXT:    s_endpgm
207                                       <8 x i4> addrspace(1)* %src2,
208                                       i32 addrspace(1)* nocapture %dst) {
209entry:
  ; The work-item id selects which <8 x i4> element of each source is loaded.
210  %idx = call i32 @llvm.amdgcn.workitem.id.x()
211  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
212  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
213  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
214  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
215
  ; Lanes 0..7: extract the i4 from each vector, zero-extend both to i32 and
  ; multiply them.
216  %v1e0 = extractelement <8 x i4> %vec1, i64 0
217  %cv1e0 = zext i4 %v1e0 to i32
218  %v2e0 = extractelement <8 x i4> %vec2, i64 0
219  %cv2e0 = zext i4 %v2e0 to i32
220  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
221
222  %v1e1 = extractelement <8 x i4> %vec1, i64 1
223  %cv1e1 = zext i4 %v1e1 to i32
224  %v2e1 = extractelement <8 x i4> %vec2, i64 1
225  %cv2e1 = zext i4 %v2e1 to i32
226  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
227
228  %v1e2 = extractelement <8 x i4> %vec1, i64 2
229  %cv1e2 = zext i4 %v1e2 to i32
230  %v2e2 = extractelement <8 x i4> %vec2, i64 2
231  %cv2e2 = zext i4 %v2e2 to i32
232  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
233
234  %v1e3 = extractelement <8 x i4> %vec1, i64 3
235  %cv1e3 = zext i4 %v1e3 to i32
236  %v2e3 = extractelement <8 x i4> %vec2, i64 3
237  %cv2e3 = zext i4 %v2e3 to i32
238  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
239
240  %v1e4 = extractelement <8 x i4> %vec1, i64 4
241  %cv1e4 = zext i4 %v1e4 to i32
242  %v2e4 = extractelement <8 x i4> %vec2, i64 4
243  %cv2e4 = zext i4 %v2e4 to i32
244  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
245
246  %v1e5 = extractelement <8 x i4> %vec1, i64 5
247  %cv1e5 = zext i4 %v1e5 to i32
248  %v2e5 = extractelement <8 x i4> %vec2, i64 5
249  %cv2e5 = zext i4 %v2e5 to i32
250  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
251
252  %v1e6 = extractelement <8 x i4> %vec1, i64 6
253  %cv1e6 = zext i4 %v1e6 to i32
254  %v2e6 = extractelement <8 x i4> %vec2, i64 6
255  %cv2e6 = zext i4 %v2e6 to i32
256  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
257
258  %v1e7 = extractelement <8 x i4> %vec1, i64 7
259  %cv1e7 = zext i4 %v1e7 to i32
260  %v2e7 = extractelement <8 x i4> %vec2, i64 7
261  %cv2e7 = zext i4 %v2e7 to i32
262  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
263
  ; Start from the i32 currently stored at %dst and add the products in lane
  ; order (mul0 first, mul7 last).
264  %acc = load i32, i32 addrspace(1)* %dst, align 4
265  %add1 = add i32 %mul0, %acc
266  %add2 = add i32 %add1, %mul1
267  %add3 = add i32 %add2, %mul2
268  %add4 = add i32 %add3, %mul3
269  %add5 = add i32 %add4, %mul4
270  %add6 = add i32 %add5, %mul5
271  %add7 = add i32 %add6, %mul6
272  %add8 = add i32 %add7, %mul7
273
  ; Write the accumulated sum back over the original accumulator value.
274  store i32 %add8, i32 addrspace(1)* %dst, align 4
275  ret void
276}
277
278; TODO: Remove the unnecessary instruction (the one zero-extending the
279; 2nd MAD) so that the pattern recognizer can kick in.
; udot8_acc16: unsigned dot product of two <8 x i4> vectors with an i16
; accumulator. Each work-item loads one <8 x i4> from %src1 and one from %src2
; (indexed by the work-item id), zero-extends every lane to i16, multiplies the
; lanes pairwise and adds the eight products to the i16 read from %dst, then
; writes the sum back to %dst.
; The checks below expect every target, including gfx906 and gfx1011/gfx1012,
; to expand into per-lane MAD instructions; no v_dot8_u32_u4 is expected here.
280define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
281; GFX7-LABEL: udot8_acc16:
282; GFX7:       ; %bb.0: ; %entry
283; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
284; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
285; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
286; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
287; GFX7-NEXT:    s_mov_b32 s14, -1
288; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
289; GFX7-NEXT:    s_add_u32 s12, s12, s3
290; GFX7-NEXT:    s_mov_b32 s3, 0xf000
291; GFX7-NEXT:    s_mov_b32 s10, 0
292; GFX7-NEXT:    s_mov_b32 s11, s3
293; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
294; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
295; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
296; GFX7-NEXT:    v_mov_b32_e32 v1, 0
297; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
298; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
299; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
300; GFX7-NEXT:    s_mov_b32 s2, -1
301; GFX7-NEXT:    buffer_load_ushort v16, off, s[0:3], 0
302; GFX7-NEXT:    s_addc_u32 s13, s13, 0
303; GFX7-NEXT:    s_waitcnt vmcnt(2)
304; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
305; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
306; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
307; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
308; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
309; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
310; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
311; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
312; GFX7-NEXT:    s_waitcnt vmcnt(1)
313; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
314; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
315; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
316; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
317; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
318; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
319; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
320; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
321; GFX7-NEXT:    s_waitcnt vmcnt(0)
322; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v16
323; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
324; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
325; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
326; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
327; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
328; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
329; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v9, v0
330; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
331; GFX7-NEXT:    s_endpgm
332;
333; GFX8-LABEL: udot8_acc16:
334; GFX8:       ; %bb.0: ; %entry
335; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
336; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
337; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
338; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
339; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
340; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
341; GFX8-NEXT:    v_mov_b32_e32 v1, s5
342; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
343; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
344; GFX8-NEXT:    flat_load_dword v4, v[0:1]
345; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
346; GFX8-NEXT:    v_mov_b32_e32 v1, s7
347; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
348; GFX8-NEXT:    v_mov_b32_e32 v3, s1
349; GFX8-NEXT:    flat_load_dword v0, v[0:1]
350; GFX8-NEXT:    v_mov_b32_e32 v2, s0
351; GFX8-NEXT:    flat_load_ushort v18, v[2:3]
352; GFX8-NEXT:    s_mov_b32 s10, -1
353; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
354; GFX8-NEXT:    s_add_u32 s8, s8, s3
355; GFX8-NEXT:    s_addc_u32 s9, s9, 0
356; GFX8-NEXT:    s_waitcnt vmcnt(2)
357; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v4
358; GFX8-NEXT:    v_bfe_u32 v5, v4, 24, 4
359; GFX8-NEXT:    v_bfe_u32 v6, v4, 20, 4
360; GFX8-NEXT:    v_bfe_u32 v7, v4, 16, 4
361; GFX8-NEXT:    v_bfe_u32 v8, v4, 12, 4
362; GFX8-NEXT:    v_bfe_u32 v9, v4, 8, 4
363; GFX8-NEXT:    v_bfe_u32 v10, v4, 4, 4
364; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
365; GFX8-NEXT:    s_waitcnt vmcnt(1)
366; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
367; GFX8-NEXT:    v_bfe_u32 v12, v0, 24, 4
368; GFX8-NEXT:    v_bfe_u32 v13, v0, 20, 4
369; GFX8-NEXT:    v_bfe_u32 v14, v0, 16, 4
370; GFX8-NEXT:    v_bfe_u32 v15, v0, 12, 4
371; GFX8-NEXT:    v_bfe_u32 v16, v0, 8, 4
372; GFX8-NEXT:    v_bfe_u32 v17, v0, 4, 4
373; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
374; GFX8-NEXT:    s_waitcnt vmcnt(0)
375; GFX8-NEXT:    v_mad_u16 v0, v4, v0, v18
376; GFX8-NEXT:    v_mad_u16 v0, v10, v17, v0
377; GFX8-NEXT:    v_mad_u16 v0, v9, v16, v0
378; GFX8-NEXT:    v_mad_u16 v0, v8, v15, v0
379; GFX8-NEXT:    v_mad_u16 v0, v7, v14, v0
380; GFX8-NEXT:    v_mad_u16 v0, v6, v13, v0
381; GFX8-NEXT:    v_mad_u16 v0, v5, v12, v0
382; GFX8-NEXT:    v_mad_u16 v0, v1, v11, v0
383; GFX8-NEXT:    flat_store_short v[2:3], v0
384; GFX8-NEXT:    s_endpgm
385;
386; GFX9-LABEL: udot8_acc16:
387; GFX9:       ; %bb.0: ; %entry
388; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
389; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
390; GFX9-NEXT:    s_mov_b32 s10, -1
391; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
392; GFX9-NEXT:    s_add_u32 s8, s8, s3
393; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
394; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
395; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
396; GFX9-NEXT:    v_mov_b32_e32 v1, 0
397; GFX9-NEXT:    s_addc_u32 s9, s9, 0
398; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
400; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
401; GFX9-NEXT:    global_load_ushort v17, v1, s[2:3]
402; GFX9-NEXT:    s_waitcnt vmcnt(2)
403; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
404; GFX9-NEXT:    v_bfe_u32 v4, v2, 24, 4
405; GFX9-NEXT:    v_bfe_u32 v5, v2, 20, 4
406; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 4
407; GFX9-NEXT:    v_bfe_u32 v7, v2, 12, 4
408; GFX9-NEXT:    v_bfe_u32 v8, v2, 8, 4
409; GFX9-NEXT:    v_bfe_u32 v9, v2, 4, 4
410; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
411; GFX9-NEXT:    s_waitcnt vmcnt(1)
412; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v3
413; GFX9-NEXT:    v_bfe_u32 v11, v3, 24, 4
414; GFX9-NEXT:    v_bfe_u32 v12, v3, 20, 4
415; GFX9-NEXT:    v_bfe_u32 v13, v3, 16, 4
416; GFX9-NEXT:    v_bfe_u32 v14, v3, 12, 4
417; GFX9-NEXT:    v_bfe_u32 v15, v3, 8, 4
418; GFX9-NEXT:    v_bfe_u32 v16, v3, 4, 4
419; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
420; GFX9-NEXT:    s_waitcnt vmcnt(0)
421; GFX9-NEXT:    v_mad_legacy_u16 v2, v2, v3, v17
422; GFX9-NEXT:    v_mad_legacy_u16 v2, v9, v16, v2
423; GFX9-NEXT:    v_mad_legacy_u16 v2, v8, v15, v2
424; GFX9-NEXT:    v_mad_legacy_u16 v2, v7, v14, v2
425; GFX9-NEXT:    v_mad_legacy_u16 v2, v6, v13, v2
426; GFX9-NEXT:    v_mad_legacy_u16 v2, v5, v12, v2
427; GFX9-NEXT:    v_mad_legacy_u16 v2, v4, v11, v2
428; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v10, v2
429; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
430; GFX9-NEXT:    s_endpgm
431;
432; GFX9-DL-LABEL: udot8_acc16:
433; GFX9-DL:       ; %bb.0: ; %entry
434; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
435; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
436; GFX9-DL-NEXT:    s_mov_b32 s10, -1
437; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
438; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
439; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
440; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
441; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
442; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
443; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
444; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
445; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
446; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
447; GFX9-DL-NEXT:    global_load_ushort v17, v1, s[2:3]
448; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
449; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
450; GFX9-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
451; GFX9-DL-NEXT:    v_bfe_u32 v5, v2, 20, 4
452; GFX9-DL-NEXT:    v_bfe_u32 v6, v2, 16, 4
453; GFX9-DL-NEXT:    v_bfe_u32 v7, v2, 12, 4
454; GFX9-DL-NEXT:    v_bfe_u32 v8, v2, 8, 4
455; GFX9-DL-NEXT:    v_bfe_u32 v9, v2, 4, 4
456; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
457; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
458; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v3
459; GFX9-DL-NEXT:    v_bfe_u32 v11, v3, 24, 4
460; GFX9-DL-NEXT:    v_bfe_u32 v12, v3, 20, 4
461; GFX9-DL-NEXT:    v_bfe_u32 v13, v3, 16, 4
462; GFX9-DL-NEXT:    v_bfe_u32 v14, v3, 12, 4
463; GFX9-DL-NEXT:    v_bfe_u32 v15, v3, 8, 4
464; GFX9-DL-NEXT:    v_bfe_u32 v16, v3, 4, 4
465; GFX9-DL-NEXT:    v_and_b32_e32 v3, 15, v3
466; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
467; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v17
468; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v9, v16, v2
469; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v8, v15, v2
470; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v7, v14, v2
471; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v6, v13, v2
472; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v5, v12, v2
473; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v4, v11, v2
474; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v0, v10, v2
475; GFX9-DL-NEXT:    global_store_short v1, v0, s[2:3]
476; GFX9-DL-NEXT:    s_endpgm
477;
478; GFX10-DL-LABEL: udot8_acc16:
479; GFX10-DL:       ; %bb.0: ; %entry
480; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
481; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
482; GFX10-DL-NEXT:    s_mov_b32 s10, -1
483; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
484; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
485; GFX10-DL-NEXT:    s_clause 0x1
486; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
487; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
488; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
489; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
490; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
491; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
492; GFX10-DL-NEXT:    s_clause 0x1
493; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
494; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
495; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
496; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
497; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
498; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
499; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
500; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
501; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
502; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
503; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
504; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
505; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
506; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
507; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
508; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
509; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
510; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
511; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
512; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
513; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
514; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
515; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
516; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
517; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 24, 4
518; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
519; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
520; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
521; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
522; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
523; GFX10-DL-NEXT:    global_store_short v1, v0, s[2:3]
524; GFX10-DL-NEXT:    s_endpgm
525                                       <8 x i4> addrspace(1)* %src2,
526                                       i16 addrspace(1)* nocapture %dst) {
527entry:
  ; The work-item id selects which <8 x i4> element of each source is loaded.
528  %idx = call i32 @llvm.amdgcn.workitem.id.x()
529  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
530  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
531  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
532  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
533
  ; Lanes 0..7: extract the i4 from each vector, zero-extend both to i16 and
  ; multiply them.
534  %v1e0 = extractelement <8 x i4> %vec1, i64 0
535  %cv1e0 = zext i4 %v1e0 to i16
536  %v2e0 = extractelement <8 x i4> %vec2, i64 0
537  %cv2e0 = zext i4 %v2e0 to i16
538  %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0
539
540  %v1e1 = extractelement <8 x i4> %vec1, i64 1
541  %cv1e1 = zext i4 %v1e1 to i16
542  %v2e1 = extractelement <8 x i4> %vec2, i64 1
543  %cv2e1 = zext i4 %v2e1 to i16
544  %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1
545
546  %v1e2 = extractelement <8 x i4> %vec1, i64 2
547  %cv1e2 = zext i4 %v1e2 to i16
548  %v2e2 = extractelement <8 x i4> %vec2, i64 2
549  %cv2e2 = zext i4 %v2e2 to i16
550  %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2
551
552  %v1e3 = extractelement <8 x i4> %vec1, i64 3
553  %cv1e3 = zext i4 %v1e3 to i16
554  %v2e3 = extractelement <8 x i4> %vec2, i64 3
555  %cv2e3 = zext i4 %v2e3 to i16
556  %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3
557
558  %v1e4 = extractelement <8 x i4> %vec1, i64 4
559  %cv1e4 = zext i4 %v1e4 to i16
560  %v2e4 = extractelement <8 x i4> %vec2, i64 4
561  %cv2e4 = zext i4 %v2e4 to i16
562  %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4
563
564  %v1e5 = extractelement <8 x i4> %vec1, i64 5
565  %cv1e5 = zext i4 %v1e5 to i16
566  %v2e5 = extractelement <8 x i4> %vec2, i64 5
567  %cv2e5 = zext i4 %v2e5 to i16
568  %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5
569
570  %v1e6 = extractelement <8 x i4> %vec1, i64 6
571  %cv1e6 = zext i4 %v1e6 to i16
572  %v2e6 = extractelement <8 x i4> %vec2, i64 6
573  %cv2e6 = zext i4 %v2e6 to i16
574  %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6
575
576  %v1e7 = extractelement <8 x i4> %vec1, i64 7
577  %cv1e7 = zext i4 %v1e7 to i16
578  %v2e7 = extractelement <8 x i4> %vec2, i64 7
579  %cv2e7 = zext i4 %v2e7 to i16
580  %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
581
  ; Start from the i16 currently stored at %dst and add the products in lane
  ; order (mul0 first, mul7 last).
582  %acc = load i16, i16 addrspace(1)* %dst, align 4
583  %add1 = add i16 %mul0, %acc
584  %add2 = add i16 %add1, %mul1
585  %add3 = add i16 %add2, %mul2
586  %add4 = add i16 %add3, %mul3
587  %add5 = add i16 %add4, %mul4
588  %add6 = add i16 %add5, %mul5
589  %add7 = add i16 %add6, %mul6
590  %add8 = add i16 %add7, %mul7
591
  ; Write the accumulated sum back over the original accumulator value.
592  store i16 %add8, i16 addrspace(1)* %dst, align 4
593  ret void
594}
595
596; TODO: Remove the unnecessary instruction (the one zero-extending the
597; 2nd MAD) so that the pattern recognizer can kick in.
598define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
599; GFX7-LABEL: udot8_acc8:
600; GFX7:       ; %bb.0: ; %entry
601; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
602; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
603; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
604; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
605; GFX7-NEXT:    s_mov_b32 s14, -1
606; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
607; GFX7-NEXT:    s_add_u32 s12, s12, s3
608; GFX7-NEXT:    s_mov_b32 s3, 0xf000
609; GFX7-NEXT:    s_mov_b32 s10, 0
610; GFX7-NEXT:    s_mov_b32 s11, s3
611; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
612; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
613; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
614; GFX7-NEXT:    v_mov_b32_e32 v1, 0
615; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
616; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
617; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
618; GFX7-NEXT:    s_mov_b32 s2, -1
619; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
620; GFX7-NEXT:    s_addc_u32 s13, s13, 0
621; GFX7-NEXT:    s_waitcnt vmcnt(2)
622; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
623; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
624; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
625; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
626; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
627; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
628; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
629; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
630; GFX7-NEXT:    s_waitcnt vmcnt(1)
631; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
632; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
633; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
634; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
635; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
636; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
637; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
638; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
639; GFX7-NEXT:    s_waitcnt vmcnt(0)
640; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v16
641; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
642; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
643; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
644; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
645; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
646; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
647; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v9, v0
648; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
649; GFX7-NEXT:    s_endpgm
650;
651; GFX8-LABEL: udot8_acc8:
652; GFX8:       ; %bb.0: ; %entry
653; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
654; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
655; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
656; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
657; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
658; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
659; GFX8-NEXT:    v_mov_b32_e32 v1, s5
660; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
661; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
662; GFX8-NEXT:    flat_load_dword v4, v[0:1]
663; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
664; GFX8-NEXT:    v_mov_b32_e32 v1, s7
665; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
666; GFX8-NEXT:    v_mov_b32_e32 v3, s1
667; GFX8-NEXT:    flat_load_dword v0, v[0:1]
668; GFX8-NEXT:    v_mov_b32_e32 v2, s0
669; GFX8-NEXT:    flat_load_ubyte v18, v[2:3]
670; GFX8-NEXT:    s_mov_b32 s10, -1
671; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
672; GFX8-NEXT:    s_add_u32 s8, s8, s3
673; GFX8-NEXT:    s_addc_u32 s9, s9, 0
674; GFX8-NEXT:    s_waitcnt vmcnt(2)
675; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v4
676; GFX8-NEXT:    v_bfe_u32 v5, v4, 24, 4
677; GFX8-NEXT:    v_bfe_u32 v6, v4, 20, 4
678; GFX8-NEXT:    v_bfe_u32 v7, v4, 16, 4
679; GFX8-NEXT:    v_bfe_u32 v8, v4, 12, 4
680; GFX8-NEXT:    v_bfe_u32 v9, v4, 8, 4
681; GFX8-NEXT:    v_bfe_u32 v10, v4, 4, 4
682; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
683; GFX8-NEXT:    s_waitcnt vmcnt(1)
684; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
685; GFX8-NEXT:    v_bfe_u32 v12, v0, 24, 4
686; GFX8-NEXT:    v_bfe_u32 v13, v0, 20, 4
687; GFX8-NEXT:    v_bfe_u32 v14, v0, 16, 4
688; GFX8-NEXT:    v_bfe_u32 v15, v0, 12, 4
689; GFX8-NEXT:    v_bfe_u32 v16, v0, 8, 4
690; GFX8-NEXT:    v_bfe_u32 v17, v0, 4, 4
691; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
692; GFX8-NEXT:    s_waitcnt vmcnt(0)
693; GFX8-NEXT:    v_mad_u16 v0, v4, v0, v18
694; GFX8-NEXT:    v_mad_u16 v0, v10, v17, v0
695; GFX8-NEXT:    v_mad_u16 v0, v9, v16, v0
696; GFX8-NEXT:    v_mad_u16 v0, v8, v15, v0
697; GFX8-NEXT:    v_mad_u16 v0, v7, v14, v0
698; GFX8-NEXT:    v_mad_u16 v0, v6, v13, v0
699; GFX8-NEXT:    v_mad_u16 v0, v5, v12, v0
700; GFX8-NEXT:    v_mad_u16 v0, v1, v11, v0
701; GFX8-NEXT:    flat_store_byte v[2:3], v0
702; GFX8-NEXT:    s_endpgm
703;
704; GFX9-LABEL: udot8_acc8:
705; GFX9:       ; %bb.0: ; %entry
706; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
707; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
708; GFX9-NEXT:    s_mov_b32 s10, -1
709; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
710; GFX9-NEXT:    s_add_u32 s8, s8, s3
711; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
712; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
713; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
714; GFX9-NEXT:    v_mov_b32_e32 v1, 0
715; GFX9-NEXT:    s_addc_u32 s9, s9, 0
716; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
717; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
718; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
719; GFX9-NEXT:    global_load_ubyte v17, v1, s[2:3]
720; GFX9-NEXT:    s_waitcnt vmcnt(2)
721; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
722; GFX9-NEXT:    v_bfe_u32 v4, v2, 24, 4
723; GFX9-NEXT:    v_bfe_u32 v5, v2, 20, 4
724; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 4
725; GFX9-NEXT:    v_bfe_u32 v7, v2, 12, 4
726; GFX9-NEXT:    v_bfe_u32 v8, v2, 8, 4
727; GFX9-NEXT:    v_bfe_u32 v9, v2, 4, 4
728; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
729; GFX9-NEXT:    s_waitcnt vmcnt(1)
730; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v3
731; GFX9-NEXT:    v_bfe_u32 v11, v3, 24, 4
732; GFX9-NEXT:    v_bfe_u32 v12, v3, 20, 4
733; GFX9-NEXT:    v_bfe_u32 v13, v3, 16, 4
734; GFX9-NEXT:    v_bfe_u32 v14, v3, 12, 4
735; GFX9-NEXT:    v_bfe_u32 v15, v3, 8, 4
736; GFX9-NEXT:    v_bfe_u32 v16, v3, 4, 4
737; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
738; GFX9-NEXT:    s_waitcnt vmcnt(0)
739; GFX9-NEXT:    v_mad_legacy_u16 v2, v2, v3, v17
740; GFX9-NEXT:    v_mad_legacy_u16 v2, v9, v16, v2
741; GFX9-NEXT:    v_mad_legacy_u16 v2, v8, v15, v2
742; GFX9-NEXT:    v_mad_legacy_u16 v2, v7, v14, v2
743; GFX9-NEXT:    v_mad_legacy_u16 v2, v6, v13, v2
744; GFX9-NEXT:    v_mad_legacy_u16 v2, v5, v12, v2
745; GFX9-NEXT:    v_mad_legacy_u16 v2, v4, v11, v2
746; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v10, v2
747; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
748; GFX9-NEXT:    s_endpgm
749;
750; GFX9-DL-LABEL: udot8_acc8:
751; GFX9-DL:       ; %bb.0: ; %entry
752; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
753; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
754; GFX9-DL-NEXT:    s_mov_b32 s10, -1
755; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
756; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
757; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
758; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
759; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
760; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
761; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
762; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
763; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
764; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
765; GFX9-DL-NEXT:    global_load_ubyte v17, v1, s[2:3]
766; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
767; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
768; GFX9-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
769; GFX9-DL-NEXT:    v_bfe_u32 v5, v2, 20, 4
770; GFX9-DL-NEXT:    v_bfe_u32 v6, v2, 16, 4
771; GFX9-DL-NEXT:    v_bfe_u32 v7, v2, 12, 4
772; GFX9-DL-NEXT:    v_bfe_u32 v8, v2, 8, 4
773; GFX9-DL-NEXT:    v_bfe_u32 v9, v2, 4, 4
774; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
775; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
776; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v3
777; GFX9-DL-NEXT:    v_bfe_u32 v11, v3, 24, 4
778; GFX9-DL-NEXT:    v_bfe_u32 v12, v3, 20, 4
779; GFX9-DL-NEXT:    v_bfe_u32 v13, v3, 16, 4
780; GFX9-DL-NEXT:    v_bfe_u32 v14, v3, 12, 4
781; GFX9-DL-NEXT:    v_bfe_u32 v15, v3, 8, 4
782; GFX9-DL-NEXT:    v_bfe_u32 v16, v3, 4, 4
783; GFX9-DL-NEXT:    v_and_b32_e32 v3, 15, v3
784; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
785; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v17
786; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v9, v16, v2
787; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v8, v15, v2
788; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v7, v14, v2
789; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v6, v13, v2
790; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v5, v12, v2
791; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v4, v11, v2
792; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v0, v10, v2
793; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
794; GFX9-DL-NEXT:    s_endpgm
795;
796; GFX10-DL-LABEL: udot8_acc8:
797; GFX10-DL:       ; %bb.0: ; %entry
798; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
799; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
800; GFX10-DL-NEXT:    s_mov_b32 s10, -1
801; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
802; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
803; GFX10-DL-NEXT:    s_clause 0x1
804; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
805; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
806; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
807; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
808; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
809; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
810; GFX10-DL-NEXT:    s_clause 0x1
811; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
812; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
813; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
814; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
815; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
816; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
817; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
818; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
819; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
820; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
821; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
822; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
823; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
824; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
825; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
826; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
827; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
828; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
829; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
830; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
831; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
832; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
833; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
834; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
835; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 24, 4
836; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
837; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
838; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
839; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
840; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
841; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
842; GFX10-DL-NEXT:    s_endpgm
843                                      <8 x i4> addrspace(1)* %src2,
844                                      i8 addrspace(1)* nocapture %dst) {
845entry:
846  %idx = call i32 @llvm.amdgcn.workitem.id.x()
847  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
848  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
849  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
850  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
851
852  %v1e0 = extractelement <8 x i4> %vec1, i64 0
853  %cv1e0 = zext i4 %v1e0 to i8
854  %v2e0 = extractelement <8 x i4> %vec2, i64 0
855  %cv2e0 = zext i4 %v2e0 to i8
856  %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0
857
858  %v1e1 = extractelement <8 x i4> %vec1, i64 1
859  %cv1e1 = zext i4 %v1e1 to i8
860  %v2e1 = extractelement <8 x i4> %vec2, i64 1
861  %cv2e1 = zext i4 %v2e1 to i8
862  %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1
863
864  %v1e2 = extractelement <8 x i4> %vec1, i64 2
865  %cv1e2 = zext i4 %v1e2 to i8
866  %v2e2 = extractelement <8 x i4> %vec2, i64 2
867  %cv2e2 = zext i4 %v2e2 to i8
868  %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2
869
870  %v1e3 = extractelement <8 x i4> %vec1, i64 3
871  %cv1e3 = zext i4 %v1e3 to i8
872  %v2e3 = extractelement <8 x i4> %vec2, i64 3
873  %cv2e3 = zext i4 %v2e3 to i8
874  %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3
875
876  %v1e4 = extractelement <8 x i4> %vec1, i64 4
877  %cv1e4 = zext i4 %v1e4 to i8
878  %v2e4 = extractelement <8 x i4> %vec2, i64 4
879  %cv2e4 = zext i4 %v2e4 to i8
880  %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4
881
882  %v1e5 = extractelement <8 x i4> %vec1, i64 5
883  %cv1e5 = zext i4 %v1e5 to i8
884  %v2e5 = extractelement <8 x i4> %vec2, i64 5
885  %cv2e5 = zext i4 %v2e5 to i8
886  %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5
887
888  %v1e6 = extractelement <8 x i4> %vec1, i64 6
889  %cv1e6 = zext i4 %v1e6 to i8
890  %v2e6 = extractelement <8 x i4> %vec2, i64 6
891  %cv2e6 = zext i4 %v2e6 to i8
892  %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6
893
894  %v1e7 = extractelement <8 x i4> %vec1, i64 7
895  %cv1e7 = zext i4 %v1e7 to i8
896  %v2e7 = extractelement <8 x i4> %vec2, i64 7
897  %cv2e7 = zext i4 %v2e7 to i8
898  %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
899
900  %acc = load i8, i8 addrspace(1)* %dst, align 4
901  %add1 = add i8 %mul0, %acc
902  %add2 = add i8 %add1, %mul1
903  %add3 = add i8 %add2, %mul2
904  %add4 = add i8 %add3, %mul3
905  %add5 = add i8 %add4, %mul4
906  %add6 = add i8 %add5, %mul5
907  %add7 = add i8 %add6, %mul6
908  %add8 = add i8 %add7, %mul7
909
910  store i8 %add8, i8 addrspace(1)* %dst, align 4
911  ret void
912}
913
914; TODO: Remove the two unnecessary instructions (and + add after the 2nd MAD)
915; so that the pattern recognizer can kick in.
; udot8_acc4: each work-item loads one <8 x i4> vector from %src1 and one from
; %src2 (both indexed by the workitem id), multiplies the eight element pairs
; directly in i4 (no zext/sext widening), adds the products to an i4
; accumulator loaded from %dst, and stores the i4 sum back to %dst.
; The autogenerated checks below expect, for every tested -mcpu, a chain of
; eight per-element MAD instructions (v_mad_u32_u24 on gfx700; v_mad_u16 on
; gfx803 and gfx1011/gfx1012; v_mad_legacy_u16 on gfx900/gfx906) followed by a
; v_and_b32 with 15 before the byte store; no dot instruction is expected.
916define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
917; GFX7-LABEL: udot8_acc4:
918; GFX7:       ; %bb.0: ; %entry
919; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
920; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
921; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
922; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
923; GFX7-NEXT:    s_mov_b32 s14, -1
924; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
925; GFX7-NEXT:    s_add_u32 s12, s12, s3
926; GFX7-NEXT:    s_mov_b32 s3, 0xf000
927; GFX7-NEXT:    s_mov_b32 s10, 0
928; GFX7-NEXT:    s_mov_b32 s11, s3
929; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
930; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
931; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
932; GFX7-NEXT:    v_mov_b32_e32 v1, 0
933; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
934; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
935; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
936; GFX7-NEXT:    s_mov_b32 s2, -1
937; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
938; GFX7-NEXT:    s_addc_u32 s13, s13, 0
939; GFX7-NEXT:    s_waitcnt vmcnt(2)
940; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
941; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
942; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
943; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
944; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
945; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
946; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
947; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
948; GFX7-NEXT:    s_waitcnt vmcnt(1)
949; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
950; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
951; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
952; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
953; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
954; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
955; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
956; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
957; GFX7-NEXT:    s_waitcnt vmcnt(0)
958; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v16
959; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
960; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
961; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
962; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
963; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
964; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
965; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v9, v0
966; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
967; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
968; GFX7-NEXT:    s_endpgm
969;
970; GFX8-LABEL: udot8_acc4:
971; GFX8:       ; %bb.0: ; %entry
972; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
973; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
974; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
975; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
976; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
977; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
978; GFX8-NEXT:    v_mov_b32_e32 v1, s5
979; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
980; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
981; GFX8-NEXT:    flat_load_dword v4, v[0:1]
982; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
983; GFX8-NEXT:    v_mov_b32_e32 v1, s7
984; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
985; GFX8-NEXT:    v_mov_b32_e32 v3, s1
986; GFX8-NEXT:    flat_load_dword v0, v[0:1]
987; GFX8-NEXT:    v_mov_b32_e32 v2, s0
988; GFX8-NEXT:    flat_load_ubyte v18, v[2:3]
989; GFX8-NEXT:    s_mov_b32 s10, -1
990; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
991; GFX8-NEXT:    s_add_u32 s8, s8, s3
992; GFX8-NEXT:    s_addc_u32 s9, s9, 0
993; GFX8-NEXT:    s_waitcnt vmcnt(2)
994; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v4
995; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
996; GFX8-NEXT:    v_bfe_u32 v6, v4, 20, 4
997; GFX8-NEXT:    v_bfe_u32 v7, v4, 16, 4
998; GFX8-NEXT:    v_bfe_u32 v8, v4, 12, 4
999; GFX8-NEXT:    v_bfe_u32 v9, v4, 8, 4
1000; GFX8-NEXT:    v_bfe_u32 v10, v4, 4, 4
1001; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
1002; GFX8-NEXT:    s_waitcnt vmcnt(1)
1003; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
1004; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
1005; GFX8-NEXT:    v_bfe_u32 v13, v0, 20, 4
1006; GFX8-NEXT:    v_bfe_u32 v14, v0, 16, 4
1007; GFX8-NEXT:    v_bfe_u32 v15, v0, 12, 4
1008; GFX8-NEXT:    v_bfe_u32 v16, v0, 8, 4
1009; GFX8-NEXT:    v_bfe_u32 v17, v0, 4, 4
1010; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
1011; GFX8-NEXT:    s_waitcnt vmcnt(0)
1012; GFX8-NEXT:    v_mad_u16 v0, v4, v0, v18
1013; GFX8-NEXT:    v_mad_u16 v0, v10, v17, v0
1014; GFX8-NEXT:    v_mad_u16 v0, v9, v16, v0
1015; GFX8-NEXT:    v_mad_u16 v0, v8, v15, v0
1016; GFX8-NEXT:    v_mad_u16 v0, v7, v14, v0
1017; GFX8-NEXT:    v_mad_u16 v0, v6, v13, v0
1018; GFX8-NEXT:    v_mad_u16 v0, v5, v12, v0
1019; GFX8-NEXT:    v_mad_u16 v0, v1, v11, v0
1020; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
1021; GFX8-NEXT:    flat_store_byte v[2:3], v0
1022; GFX8-NEXT:    s_endpgm
1023;
1024; GFX9-LABEL: udot8_acc4:
1025; GFX9:       ; %bb.0: ; %entry
1026; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1027; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1028; GFX9-NEXT:    s_mov_b32 s10, -1
1029; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
1030; GFX9-NEXT:    s_add_u32 s8, s8, s3
1031; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1032; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1033; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1034; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1035; GFX9-NEXT:    s_addc_u32 s9, s9, 0
1036; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1037; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
1038; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
1039; GFX9-NEXT:    global_load_ubyte v17, v1, s[2:3]
1040; GFX9-NEXT:    s_waitcnt vmcnt(2)
1041; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
1042; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
1043; GFX9-NEXT:    v_bfe_u32 v5, v2, 20, 4
1044; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 4
1045; GFX9-NEXT:    v_bfe_u32 v7, v2, 12, 4
1046; GFX9-NEXT:    v_bfe_u32 v8, v2, 8, 4
1047; GFX9-NEXT:    v_bfe_u32 v9, v2, 4, 4
1048; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
1049; GFX9-NEXT:    s_waitcnt vmcnt(1)
1050; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v3
1051; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
1052; GFX9-NEXT:    v_bfe_u32 v12, v3, 20, 4
1053; GFX9-NEXT:    v_bfe_u32 v13, v3, 16, 4
1054; GFX9-NEXT:    v_bfe_u32 v14, v3, 12, 4
1055; GFX9-NEXT:    v_bfe_u32 v15, v3, 8, 4
1056; GFX9-NEXT:    v_bfe_u32 v16, v3, 4, 4
1057; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
1058; GFX9-NEXT:    s_waitcnt vmcnt(0)
1059; GFX9-NEXT:    v_mad_legacy_u16 v2, v2, v3, v17
1060; GFX9-NEXT:    v_mad_legacy_u16 v2, v9, v16, v2
1061; GFX9-NEXT:    v_mad_legacy_u16 v2, v8, v15, v2
1062; GFX9-NEXT:    v_mad_legacy_u16 v2, v7, v14, v2
1063; GFX9-NEXT:    v_mad_legacy_u16 v2, v6, v13, v2
1064; GFX9-NEXT:    v_mad_legacy_u16 v2, v5, v12, v2
1065; GFX9-NEXT:    v_mad_legacy_u16 v2, v4, v11, v2
1066; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v10, v2
1067; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
1068; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
1069; GFX9-NEXT:    s_endpgm
1070;
1071; GFX9-DL-LABEL: udot8_acc4:
1072; GFX9-DL:       ; %bb.0: ; %entry
1073; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1074; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1075; GFX9-DL-NEXT:    s_mov_b32 s10, -1
1076; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
1077; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
1078; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1079; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1080; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1081; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
1082; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
1083; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1084; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
1085; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
1086; GFX9-DL-NEXT:    global_load_ubyte v17, v1, s[2:3]
1087; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1088; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
1089; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
1090; GFX9-DL-NEXT:    v_bfe_u32 v5, v2, 20, 4
1091; GFX9-DL-NEXT:    v_bfe_u32 v6, v2, 16, 4
1092; GFX9-DL-NEXT:    v_bfe_u32 v7, v2, 12, 4
1093; GFX9-DL-NEXT:    v_bfe_u32 v8, v2, 8, 4
1094; GFX9-DL-NEXT:    v_bfe_u32 v9, v2, 4, 4
1095; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
1096; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1097; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v3
1098; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
1099; GFX9-DL-NEXT:    v_bfe_u32 v12, v3, 20, 4
1100; GFX9-DL-NEXT:    v_bfe_u32 v13, v3, 16, 4
1101; GFX9-DL-NEXT:    v_bfe_u32 v14, v3, 12, 4
1102; GFX9-DL-NEXT:    v_bfe_u32 v15, v3, 8, 4
1103; GFX9-DL-NEXT:    v_bfe_u32 v16, v3, 4, 4
1104; GFX9-DL-NEXT:    v_and_b32_e32 v3, 15, v3
1105; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1106; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v17
1107; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v9, v16, v2
1108; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v8, v15, v2
1109; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v7, v14, v2
1110; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v6, v13, v2
1111; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v5, v12, v2
1112; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v4, v11, v2
1113; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v0, v10, v2
1114; GFX9-DL-NEXT:    v_and_b32_e32 v0, 15, v0
1115; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
1116; GFX9-DL-NEXT:    s_endpgm
1117;
1118; GFX10-DL-LABEL: udot8_acc4:
1119; GFX10-DL:       ; %bb.0: ; %entry
1120; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1121; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1122; GFX10-DL-NEXT:    s_mov_b32 s10, -1
1123; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
1124; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
1125; GFX10-DL-NEXT:    s_clause 0x1
1126; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1127; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1128; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1129; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1130; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
1131; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1132; GFX10-DL-NEXT:    s_clause 0x1
1133; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
1134; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
1135; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
1136; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1137; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
1138; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1139; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
1140; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
1141; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
1142; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1143; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
1144; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
1145; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
1146; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1147; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
1148; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
1149; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1150; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
1151; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
1152; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1153; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
1154; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
1155; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1156; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
1157; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v3
1158; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
1159; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
1160; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1161; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1162; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
1163; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v0
1164; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
1165; GFX10-DL-NEXT:    s_endpgm
1166                                      <8 x i4> addrspace(1)* %src2,
1167                                      i4 addrspace(1)* nocapture %dst) {
1168entry:
  ; The workitem id selects which <8 x i4> element of %src1 and %src2 this
  ; lane reads; %dst is not indexed.
1169  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1170  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1171  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1172  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1173  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
1174
  ; Element-wise products for lanes 0..7. The operands are used exactly as
  ; extracted (i4), so each multiply is an i4 multiply carrying nuw and nsw.
1175  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1176  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1177  %mul0 = mul nuw nsw i4 %v1e0, %v2e0
1178
1179  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1180  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1181  %mul1 = mul nuw nsw i4 %v1e1, %v2e1
1182
1183  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1184  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1185  %mul2 = mul nuw nsw i4 %v1e2, %v2e2
1186
1187  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1188  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1189  %mul3 = mul nuw nsw i4 %v1e3, %v2e3
1190
1191  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1192  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1193  %mul4 = mul nuw nsw i4 %v1e4, %v2e4
1194
1195  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1196  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1197  %mul5 = mul nuw nsw i4 %v1e5, %v2e5
1198
1199  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1200  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1201  %mul6 = mul nuw nsw i4 %v1e6, %v2e6
1202
1203  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1204  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1205  %mul7 = mul nuw nsw i4 %v1e7, %v2e7
1206
  ; Accumulate in i4: start from the value currently stored at %dst and add
  ; the products in element order (mul0 first, mul7 last). The adds carry no
  ; nuw/nsw flags.
1207  %acc = load i4, i4 addrspace(1)* %dst, align 4
1208  %add1 = add i4 %mul0, %acc
1209  %add2 = add i4 %add1, %mul1
1210  %add3 = add i4 %add2, %mul2
1211  %add4 = add i4 %add3, %mul3
1212  %add5 = add i4 %add4, %mul4
1213  %add6 = add i4 %add5, %mul5
1214  %add7 = add i4 %add6, %mul6
1215  %add8 = add i4 %add7, %mul7
1216
  ; Write the i4 sum back to the same address the accumulator was read from.
1217  store i4 %add8, i4 addrspace(1)* %dst, align 4
1218  ret void
1219}
1220
1221; TODO: Currently, permutation of udot8 is turned off due to a huge increase
1222; in compile time.
1223define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1,
1224; GFX7-LABEL: udot8_CommutationInsideMAD:
1225; GFX7:       ; %bb.0: ; %entry
1226; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1227; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1228; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1229; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1230; GFX7-NEXT:    s_mov_b32 s14, -1
1231; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
1232; GFX7-NEXT:    s_add_u32 s12, s12, s3
1233; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1234; GFX7-NEXT:    s_mov_b32 s10, 0
1235; GFX7-NEXT:    s_mov_b32 s11, s3
1236; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1237; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1238; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1239; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1240; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1241; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1242; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1243; GFX7-NEXT:    s_mov_b32 s2, -1
1244; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
1245; GFX7-NEXT:    s_addc_u32 s13, s13, 0
1246; GFX7-NEXT:    s_waitcnt vmcnt(2)
1247; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
1248; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
1249; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
1250; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
1251; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
1252; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
1253; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
1254; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
1255; GFX7-NEXT:    s_waitcnt vmcnt(1)
1256; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
1257; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
1258; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
1259; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
1260; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
1261; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
1262; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
1263; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
1264; GFX7-NEXT:    s_waitcnt vmcnt(0)
1265; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v16
1266; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
1267; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
1268; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
1269; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
1270; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
1271; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
1272; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v9, v0
1273; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
1274; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1275; GFX7-NEXT:    s_endpgm
1276;
1277; GFX8-LABEL: udot8_CommutationInsideMAD:
1278; GFX8:       ; %bb.0: ; %entry
1279; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1280; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1281; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1282; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1283; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1284; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1285; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1286; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1287; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1288; GFX8-NEXT:    flat_load_dword v4, v[0:1]
1289; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1290; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1291; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1292; GFX8-NEXT:    v_mov_b32_e32 v3, s1
1293; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1294; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1295; GFX8-NEXT:    flat_load_ubyte v18, v[2:3]
1296; GFX8-NEXT:    s_mov_b32 s10, -1
1297; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
1298; GFX8-NEXT:    s_add_u32 s8, s8, s3
1299; GFX8-NEXT:    s_addc_u32 s9, s9, 0
1300; GFX8-NEXT:    s_waitcnt vmcnt(2)
1301; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v4
1302; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v4
1303; GFX8-NEXT:    v_bfe_u32 v6, v4, 20, 4
1304; GFX8-NEXT:    v_bfe_u32 v7, v4, 16, 4
1305; GFX8-NEXT:    v_bfe_u32 v8, v4, 12, 4
1306; GFX8-NEXT:    v_bfe_u32 v9, v4, 8, 4
1307; GFX8-NEXT:    v_bfe_u32 v10, v4, 4, 4
1308; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
1309; GFX8-NEXT:    s_waitcnt vmcnt(1)
1310; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
1311; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
1312; GFX8-NEXT:    v_bfe_u32 v13, v0, 20, 4
1313; GFX8-NEXT:    v_bfe_u32 v14, v0, 16, 4
1314; GFX8-NEXT:    v_bfe_u32 v15, v0, 12, 4
1315; GFX8-NEXT:    v_bfe_u32 v16, v0, 8, 4
1316; GFX8-NEXT:    v_bfe_u32 v17, v0, 4, 4
1317; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
1318; GFX8-NEXT:    s_waitcnt vmcnt(0)
1319; GFX8-NEXT:    v_mad_u16 v0, v4, v0, v18
1320; GFX8-NEXT:    v_mad_u16 v0, v10, v17, v0
1321; GFX8-NEXT:    v_mad_u16 v0, v9, v16, v0
1322; GFX8-NEXT:    v_mad_u16 v0, v8, v15, v0
1323; GFX8-NEXT:    v_mad_u16 v0, v7, v14, v0
1324; GFX8-NEXT:    v_mad_u16 v0, v6, v13, v0
1325; GFX8-NEXT:    v_mad_u16 v0, v5, v12, v0
1326; GFX8-NEXT:    v_mad_u16 v0, v1, v11, v0
1327; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
1328; GFX8-NEXT:    flat_store_byte v[2:3], v0
1329; GFX8-NEXT:    s_endpgm
1330;
1331; GFX9-LABEL: udot8_CommutationInsideMAD:
1332; GFX9:       ; %bb.0: ; %entry
1333; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1334; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1335; GFX9-NEXT:    s_mov_b32 s10, -1
1336; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
1337; GFX9-NEXT:    s_add_u32 s8, s8, s3
1338; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1339; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1340; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1341; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1342; GFX9-NEXT:    s_addc_u32 s9, s9, 0
1343; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1344; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
1345; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
1346; GFX9-NEXT:    global_load_ubyte v17, v1, s[2:3]
1347; GFX9-NEXT:    s_waitcnt vmcnt(2)
1348; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
1349; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
1350; GFX9-NEXT:    v_bfe_u32 v5, v2, 20, 4
1351; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 4
1352; GFX9-NEXT:    v_bfe_u32 v7, v2, 12, 4
1353; GFX9-NEXT:    v_bfe_u32 v8, v2, 8, 4
1354; GFX9-NEXT:    v_bfe_u32 v9, v2, 4, 4
1355; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
1356; GFX9-NEXT:    s_waitcnt vmcnt(1)
1357; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v3
1358; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
1359; GFX9-NEXT:    v_bfe_u32 v12, v3, 20, 4
1360; GFX9-NEXT:    v_bfe_u32 v13, v3, 16, 4
1361; GFX9-NEXT:    v_bfe_u32 v14, v3, 12, 4
1362; GFX9-NEXT:    v_bfe_u32 v15, v3, 8, 4
1363; GFX9-NEXT:    v_bfe_u32 v16, v3, 4, 4
1364; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
1365; GFX9-NEXT:    s_waitcnt vmcnt(0)
1366; GFX9-NEXT:    v_mad_legacy_u16 v2, v2, v3, v17
1367; GFX9-NEXT:    v_mad_legacy_u16 v2, v9, v16, v2
1368; GFX9-NEXT:    v_mad_legacy_u16 v2, v8, v15, v2
1369; GFX9-NEXT:    v_mad_legacy_u16 v2, v7, v14, v2
1370; GFX9-NEXT:    v_mad_legacy_u16 v2, v6, v13, v2
1371; GFX9-NEXT:    v_mad_legacy_u16 v2, v5, v12, v2
1372; GFX9-NEXT:    v_mad_legacy_u16 v2, v4, v11, v2
1373; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v10, v2
1374; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
1375; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
1376; GFX9-NEXT:    s_endpgm
1377;
1378; GFX9-DL-LABEL: udot8_CommutationInsideMAD:
1379; GFX9-DL:       ; %bb.0: ; %entry
1380; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1381; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1382; GFX9-DL-NEXT:    s_mov_b32 s10, -1
1383; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
1384; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
1385; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1386; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1387; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1388; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
1389; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
1390; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1391; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
1392; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
1393; GFX9-DL-NEXT:    global_load_ubyte v17, v1, s[2:3]
1394; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1395; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v2
1396; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
1397; GFX9-DL-NEXT:    v_bfe_u32 v5, v2, 20, 4
1398; GFX9-DL-NEXT:    v_bfe_u32 v6, v2, 16, 4
1399; GFX9-DL-NEXT:    v_bfe_u32 v7, v2, 12, 4
1400; GFX9-DL-NEXT:    v_bfe_u32 v8, v2, 8, 4
1401; GFX9-DL-NEXT:    v_bfe_u32 v9, v2, 4, 4
1402; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
1403; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1404; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v3
1405; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 24, v3
1406; GFX9-DL-NEXT:    v_bfe_u32 v12, v3, 20, 4
1407; GFX9-DL-NEXT:    v_bfe_u32 v13, v3, 16, 4
1408; GFX9-DL-NEXT:    v_bfe_u32 v14, v3, 12, 4
1409; GFX9-DL-NEXT:    v_bfe_u32 v15, v3, 8, 4
1410; GFX9-DL-NEXT:    v_bfe_u32 v16, v3, 4, 4
1411; GFX9-DL-NEXT:    v_and_b32_e32 v3, 15, v3
1412; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1413; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v17
1414; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v9, v16, v2
1415; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v8, v15, v2
1416; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v7, v14, v2
1417; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v6, v13, v2
1418; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v5, v12, v2
1419; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v4, v11, v2
1420; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v0, v10, v2
1421; GFX9-DL-NEXT:    v_and_b32_e32 v0, 15, v0
1422; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
1423; GFX9-DL-NEXT:    s_endpgm
1424;
1425; GFX10-DL-LABEL: udot8_CommutationInsideMAD:
1426; GFX10-DL:       ; %bb.0: ; %entry
1427; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1428; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1429; GFX10-DL-NEXT:    s_mov_b32 s10, -1
1430; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
1431; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
1432; GFX10-DL-NEXT:    s_clause 0x1
1433; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1434; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1435; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1436; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1437; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
1438; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1439; GFX10-DL-NEXT:    s_clause 0x1
1440; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
1441; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
1442; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
1443; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1444; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
1445; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1446; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
1447; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
1448; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
1449; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1450; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
1451; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
1452; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
1453; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1454; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
1455; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
1456; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1457; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
1458; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
1459; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1460; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
1461; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
1462; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1463; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
1464; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v3
1465; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
1466; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
1467; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1468; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1469; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
1470; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v0
1471; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
1472; GFX10-DL-NEXT:    s_endpgm
1473                                                      <8 x i4> addrspace(1)* %src2,
1474                                                      i4 addrspace(1)* nocapture %dst) {
1475entry:
1476  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1477  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1478  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1479  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1480  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
1481
1482  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1483  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1484  %mul0 = mul nuw nsw i4 %v1e0, %v2e0
1485
1486  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1487  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1488  %mul1 = mul nuw nsw i4 %v1e1, %v2e1
1489
1490  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1491  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1492  %mul2 = mul nuw nsw i4 %v1e2, %v2e2
1493
1494  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1495  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1496  %mul3 = mul nuw nsw i4 %v1e3, %v2e3
1497
1498  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1499  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1500  %mul4 = mul nuw nsw i4 %v1e4, %v2e4
1501
1502  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1503  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1504  %mul5 = mul nuw nsw i4 %v1e5, %v2e5
1505
1506  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1507  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1508  %mul6 = mul nuw nsw i4 %v1e6, %v2e6
1509
1510  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1511  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1512  %mul7 = mul nuw nsw i4 %v1e7, %v2e7
1513
1514  %acc = load i4, i4 addrspace(1)* %dst, align 4
1515  %add1 = add i4 %mul0, %acc
1516  %add2 = add i4 %mul1, %add1
1517  %add3 = add i4 %mul2, %add2
1518  %add4 = add i4 %mul3, %add3
1519  %add5 = add i4 %mul4, %add4
1520  %add6 = add i4 %mul5, %add5
1521  %add7 = add i4 %mul6, %add6
1522  %add8 = add i4 %mul7, %add7
1523
1524  store i4 %add8, i4 addrspace(1)* %dst, align 4
1525  ret void
1526}
1527
; udot8_multiuses_mul1 - unsigned 8 x i4 dot product accumulated into an i32,
; written element by element, where the first product (%mul0) has more than
; one use: it feeds the running sum and a separate side value (%add).
; The expected code for every prefix checked below is built from
; v_mad_u32_u24 / v_mul_u32_u24 / add instructions; none of the expectations
; for this function contains v_dot8_u32_u4.
1528define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
1529; GFX7-LABEL: udot8_multiuses_mul1:
1530; GFX7:       ; %bb.0: ; %entry
1531; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1532; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1533; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1534; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1535; GFX7-NEXT:    s_mov_b32 s14, -1
1536; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
1537; GFX7-NEXT:    s_add_u32 s12, s12, s3
1538; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1539; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1540; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1541; GFX7-NEXT:    s_mov_b32 s10, 0
1542; GFX7-NEXT:    s_mov_b32 s11, s3
1543; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1544; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1545; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1546; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1547; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1548; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1549; GFX7-NEXT:    s_mov_b32 s2, -1
1550; GFX7-NEXT:    s_addc_u32 s13, s13, 0
1551; GFX7-NEXT:    s_waitcnt vmcnt(1)
1552; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
1553; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
1554; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
1555; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
1556; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
1557; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
1558; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
1559; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
1560; GFX7-NEXT:    s_waitcnt vmcnt(0)
1561; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
1562; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
1563; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
1564; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
1565; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
1566; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
1567; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
1568; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
1569; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1570; GFX7-NEXT:    v_mad_u32_u24 v16, v2, v0, s4
1571; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v16
1572; GFX7-NEXT:    v_mad_u32_u24 v2, v8, v15, v16
1573; GFX7-NEXT:    v_mad_u32_u24 v2, v7, v14, v2
1574; GFX7-NEXT:    v_mad_u32_u24 v2, v6, v13, v2
1575; GFX7-NEXT:    v_mad_u32_u24 v2, v5, v12, v2
1576; GFX7-NEXT:    v_mad_u32_u24 v2, v4, v11, v2
1577; GFX7-NEXT:    v_mad_u32_u24 v2, v3, v10, v2
1578; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v9, v2
1579; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1580; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1581; GFX7-NEXT:    s_endpgm
1582;
1583; GFX8-LABEL: udot8_multiuses_mul1:
1584; GFX8:       ; %bb.0: ; %entry
1585; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1586; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1587; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1588; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1589; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1590; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1591; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1592; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1593; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1594; GFX8-NEXT:    v_mov_b32_e32 v3, s7
1595; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v2
1596; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1597; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1598; GFX8-NEXT:    flat_load_dword v1, v[2:3]
1599; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1600; GFX8-NEXT:    s_mov_b32 s10, -1
1601; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
1602; GFX8-NEXT:    s_add_u32 s8, s8, s3
1603; GFX8-NEXT:    s_addc_u32 s9, s9, 0
1604; GFX8-NEXT:    s_waitcnt vmcnt(1)
1605; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 28, v0
1606; GFX8-NEXT:    v_bfe_u32 v3, v0, 24, 4
1607; GFX8-NEXT:    v_bfe_u32 v4, v0, 20, 4
1608; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 4
1609; GFX8-NEXT:    v_bfe_u32 v6, v0, 12, 4
1610; GFX8-NEXT:    v_bfe_u32 v7, v0, 8, 4
1611; GFX8-NEXT:    v_bfe_u32 v8, v0, 4, 4
1612; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
1613; GFX8-NEXT:    s_waitcnt vmcnt(0)
1614; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v1
1615; GFX8-NEXT:    v_bfe_u32 v10, v1, 24, 4
1616; GFX8-NEXT:    v_bfe_u32 v11, v1, 20, 4
1617; GFX8-NEXT:    v_bfe_u32 v12, v1, 16, 4
1618; GFX8-NEXT:    v_bfe_u32 v13, v1, 12, 4
1619; GFX8-NEXT:    v_bfe_u32 v14, v1, 8, 4
1620; GFX8-NEXT:    v_bfe_u32 v15, v1, 4, 4
1621; GFX8-NEXT:    v_and_b32_e32 v1, 15, v1
1622; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1623; GFX8-NEXT:    v_mad_u32_u24 v16, v0, v1, s2
1624; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v1, v16
1625; GFX8-NEXT:    v_mad_u32_u24 v1, v8, v15, v16
1626; GFX8-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
1627; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
1628; GFX8-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
1629; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
1630; GFX8-NEXT:    v_mad_u32_u24 v1, v3, v10, v1
1631; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v9, v1
1632; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
1633; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1634; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1635; GFX8-NEXT:    flat_store_dword v[0:1], v2
1636; GFX8-NEXT:    s_endpgm
1637;
1638; GFX9-LABEL: udot8_multiuses_mul1:
1639; GFX9:       ; %bb.0: ; %entry
1640; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1641; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1642; GFX9-NEXT:    s_mov_b32 s10, -1
1643; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
1644; GFX9-NEXT:    s_add_u32 s8, s8, s3
1645; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1646; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1647; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1648; GFX9-NEXT:    s_addc_u32 s9, s9, 0
1649; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1650; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
1651; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
1652; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
1653; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1654; GFX9-NEXT:    s_waitcnt vmcnt(1)
1655; GFX9-NEXT:    v_bfe_u32 v3, v1, 4, 4
1656; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1657; GFX9-NEXT:    s_waitcnt vmcnt(0)
1658; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1659; GFX9-NEXT:    v_bfe_u32 v5, v1, 24, 4
1660; GFX9-NEXT:    v_bfe_u32 v12, v2, 24, 4
1661; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
1662; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
1663; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
1664; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
1665; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
1666; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
1667; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
1668; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
1669; GFX9-NEXT:    v_bfe_u32 v10, v2, 4, 4
1670; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1671; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
1672; GFX9-NEXT:    v_mul_u32_u24_e32 v17, v1, v2
1673; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1674; GFX9-NEXT:    v_mad_u32_u24 v1, v1, v2, s0
1675; GFX9-NEXT:    v_mul_u32_u24_e32 v9, v9, v16
1676; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
1677; GFX9-NEXT:    v_mad_u32_u24 v2, v3, v10, v1
1678; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
1679; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
1680; GFX9-NEXT:    v_add3_u32 v2, v2, v9, v8
1681; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
1682; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
1683; GFX9-NEXT:    v_add3_u32 v2, v2, v7, v6
1684; GFX9-NEXT:    v_add3_u32 v2, v2, v5, v4
1685; GFX9-NEXT:    v_add3_u32 v1, v17, v1, v2
1686; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
1687; GFX9-NEXT:    s_endpgm
1688;
1689; GFX9-DL-LABEL: udot8_multiuses_mul1:
1690; GFX9-DL:       ; %bb.0: ; %entry
1691; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1692; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1693; GFX9-DL-NEXT:    s_mov_b32 s10, -1
1694; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
1695; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
1696; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1697; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1698; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1699; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
1700; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1701; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1702; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1703; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1704; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1705; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1706; GFX9-DL-NEXT:    v_bfe_u32 v3, v1, 4, 4
1707; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1708; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1709; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1710; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
1711; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 24, 4
1712; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
1713; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
1714; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
1715; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
1716; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
1717; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
1718; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
1719; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
1720; GFX9-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
1721; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1722; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
1723; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v17, v1, v2
1724; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1725; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v1, v2, s0
1726; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v9, v9, v16
1727; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
1728; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v3, v10, v1
1729; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
1730; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
1731; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v9, v8
1732; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
1733; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
1734; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v7, v6
1735; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v5, v4
1736; GFX9-DL-NEXT:    v_add3_u32 v1, v17, v1, v2
1737; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1738; GFX9-DL-NEXT:    s_endpgm
1739;
1740; GFX10-DL-LABEL: udot8_multiuses_mul1:
1741; GFX10-DL:       ; %bb.0: ; %entry
1742; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1743; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1744; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1745; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1746; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1747; GFX10-DL-NEXT:    s_mov_b32 s10, -1
1748; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
1749; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
1750; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
1751; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1752; GFX10-DL-NEXT:    s_clause 0x1
1753; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1754; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1755; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1756; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1757; GFX10-DL-NEXT:    v_and_b32_e32 v8, 15, v1
1758; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1759; GFX10-DL-NEXT:    v_and_b32_e32 v9, 15, v2
1760; GFX10-DL-NEXT:    v_bfe_u32 v0, v1, 4, 4
1761; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
1762; GFX10-DL-NEXT:    v_bfe_u32 v4, v1, 24, 4
1763; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 20, 4
1764; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 16, 4
1765; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
1766; GFX10-DL-NEXT:    v_bfe_u32 v12, v2, 12, 4
1767; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 8, 4
1768; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 8, 4
1769; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
1770; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1771; GFX10-DL-NEXT:    v_mad_u32_u24 v13, v8, v9, s2
1772; GFX10-DL-NEXT:    v_bfe_u32 v14, v2, 20, 4
1773; GFX10-DL-NEXT:    v_bfe_u32 v15, v2, 16, 4
1774; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v1, v1, v11
1775; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v7, v7, v12
1776; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v10, v13
1777; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
1778; GFX10-DL-NEXT:    v_bfe_u32 v2, v2, 24, 4
1779; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v6, v6, v15
1780; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v5, v5, v14
1781; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v7
1782; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v1, v4, v2
1783; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v10
1784; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v3, v8, v9
1785; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v6, v5
1786; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v2
1787; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1788; GFX10-DL-NEXT:    v_add3_u32 v0, v3, v13, v0
1789; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
1790; GFX10-DL-NEXT:    s_endpgm
1791                                                <8 x i4> addrspace(1)* %src2,
1792                                                i32 addrspace(1)* nocapture %dst) {
1793entry:
  ; Both source pointers are indexed by the workitem id in x, so each workitem
  ; loads its own <8 x i4> from %src1 and from %src2.
1794  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1795  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1796  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1797  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1798  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
1799
  ; For each lane i in 0..7: extract element i of both vectors, zero-extend
  ; the i4 values to i32, and multiply them (%mul<i>).
1800  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1801  %cv1e0 = zext i4 %v1e0 to i32
1802  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1803  %cv2e0 = zext i4 %v2e0 to i32
1804  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
1805
1806  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1807  %cv1e1 = zext i4 %v1e1 to i32
1808  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1809  %cv2e1 = zext i4 %v2e1 to i32
1810  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
1811
1812  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1813  %cv1e2 = zext i4 %v1e2 to i32
1814  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1815  %cv2e2 = zext i4 %v2e2 to i32
1816  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
1817
1818  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1819  %cv1e3 = zext i4 %v1e3 to i32
1820  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1821  %cv2e3 = zext i4 %v2e3 to i32
1822  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
1823
1824  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1825  %cv1e4 = zext i4 %v1e4 to i32
1826  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1827  %cv2e4 = zext i4 %v2e4 to i32
1828  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
1829
1830  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1831  %cv1e5 = zext i4 %v1e5 to i32
1832  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1833  %cv2e5 = zext i4 %v2e5 to i32
1834  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
1835
1836  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1837  %cv1e6 = zext i4 %v1e6 to i32
1838  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1839  %cv2e6 = zext i4 %v2e6 to i32
1840  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
1841
1842  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1843  %cv1e7 = zext i4 %v1e7 to i32
1844  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1845  %cv2e7 = zext i4 %v2e7 to i32
1846  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
1847
  ; The accumulator is the i32 currently stored at %dst.
1848  %acc = load i32, i32 addrspace(1)* %dst, align 4
1849  %add1 = add i32 %mul0, %acc
  ; Second use of %mul0: %add = %mul0 + %add1 is a side value that is not part
  ; of the running sum below.
1850  %add = add i32  %mul0, %add1
  ; The running sum continues from %add1 (not from %add) through %mul1..%mul7.
1851  %add2 = add i32 %add1, %mul1
1852  %add3 = add i32 %add2, %mul2
1853  %add4 = add i32 %add3, %mul3
1854  %add5 = add i32 %add4, %mul4
1855  %add6 = add i32 %add5, %mul5
1856  %add7 = add i32 %add6, %mul6
1857  %add8 = add i32 %add7, %mul7
1858
  ; The stored result combines the side value with the full running sum.
1859  %res = add i32 %add, %add8
1860  store i32 %res, i32 addrspace(1)* %dst, align 4
1861  ret void
1862}
1863
; udot8_acc32_vecMul - unsigned 8 x i4 dot product with an i32 accumulator,
; written with a single vector multiply: both inputs are zero-extended to
; <8 x i32>, multiplied as vectors, and the eight product lanes are then
; extracted and summed onto the value loaded from %dst.
; The GFX9-DL and GFX10-DL expectations below are a single v_dot8_u32_u4;
; the GFX7 and GFX8 expectations are a v_mad_u32_u24 chain, and the GFX9
; expectation is v_mul_u32_u24 plus v_add3_u32.
1864define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
1865; GFX7-LABEL: udot8_acc32_vecMul:
1866; GFX7:       ; %bb.0: ; %entry
1867; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1868; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1869; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1870; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1871; GFX7-NEXT:    s_mov_b32 s14, -1
1872; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
1873; GFX7-NEXT:    s_add_u32 s12, s12, s3
1874; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1875; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1876; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1877; GFX7-NEXT:    s_mov_b32 s10, 0
1878; GFX7-NEXT:    s_mov_b32 s11, s3
1879; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1880; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1881; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1882; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1883; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1884; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1885; GFX7-NEXT:    s_mov_b32 s2, -1
1886; GFX7-NEXT:    s_addc_u32 s13, s13, 0
1887; GFX7-NEXT:    s_waitcnt vmcnt(1)
1888; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
1889; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
1890; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
1891; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
1892; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
1893; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
1894; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
1895; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
1896; GFX7-NEXT:    s_waitcnt vmcnt(0)
1897; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
1898; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
1899; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
1900; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
1901; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
1902; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
1903; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
1904; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
1905; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1906; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, s4
1907; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
1908; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
1909; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
1910; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
1911; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
1912; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
1913; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v9, v0
1914; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1915; GFX7-NEXT:    s_endpgm
1916;
1917; GFX8-LABEL: udot8_acc32_vecMul:
1918; GFX8:       ; %bb.0: ; %entry
1919; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1920; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1921; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1922; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1923; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1924; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1925; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1926; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1927; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1928; GFX8-NEXT:    v_mov_b32_e32 v3, s7
1929; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v2
1930; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1931; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1932; GFX8-NEXT:    flat_load_dword v1, v[2:3]
1933; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1934; GFX8-NEXT:    s_mov_b32 s10, -1
1935; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
1936; GFX8-NEXT:    s_add_u32 s8, s8, s3
1937; GFX8-NEXT:    s_addc_u32 s9, s9, 0
1938; GFX8-NEXT:    s_waitcnt vmcnt(1)
1939; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 28, v0
1940; GFX8-NEXT:    v_bfe_u32 v3, v0, 24, 4
1941; GFX8-NEXT:    v_bfe_u32 v4, v0, 20, 4
1942; GFX8-NEXT:    v_bfe_u32 v5, v0, 16, 4
1943; GFX8-NEXT:    v_bfe_u32 v6, v0, 12, 4
1944; GFX8-NEXT:    v_bfe_u32 v7, v0, 8, 4
1945; GFX8-NEXT:    v_bfe_u32 v8, v0, 4, 4
1946; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
1947; GFX8-NEXT:    s_waitcnt vmcnt(0)
1948; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v1
1949; GFX8-NEXT:    v_bfe_u32 v10, v1, 24, 4
1950; GFX8-NEXT:    v_bfe_u32 v11, v1, 20, 4
1951; GFX8-NEXT:    v_bfe_u32 v12, v1, 16, 4
1952; GFX8-NEXT:    v_bfe_u32 v13, v1, 12, 4
1953; GFX8-NEXT:    v_bfe_u32 v14, v1, 8, 4
1954; GFX8-NEXT:    v_bfe_u32 v15, v1, 4, 4
1955; GFX8-NEXT:    v_and_b32_e32 v1, 15, v1
1956; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1957; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v1, s2
1958; GFX8-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
1959; GFX8-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
1960; GFX8-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
1961; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
1962; GFX8-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
1963; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
1964; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v9, v0
1965; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1966; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1967; GFX8-NEXT:    flat_store_dword v[0:1], v2
1968; GFX8-NEXT:    s_endpgm
1969;
1970; GFX9-LABEL: udot8_acc32_vecMul:
1971; GFX9:       ; %bb.0: ; %entry
1972; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1973; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1974; GFX9-NEXT:    s_mov_b32 s10, -1
1975; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
1976; GFX9-NEXT:    s_add_u32 s8, s8, s3
1977; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1978; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1979; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1980; GFX9-NEXT:    s_addc_u32 s9, s9, 0
1981; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1982; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
1983; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
1984; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
1985; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1986; GFX9-NEXT:    s_waitcnt vmcnt(1)
1987; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
1988; GFX9-NEXT:    s_waitcnt vmcnt(0)
1989; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
1990; GFX9-NEXT:    v_bfe_u32 v4, v1, 24, 4
1991; GFX9-NEXT:    v_bfe_u32 v11, v2, 24, 4
1992; GFX9-NEXT:    v_bfe_u32 v5, v1, 20, 4
1993; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
1994; GFX9-NEXT:    v_bfe_u32 v6, v1, 16, 4
1995; GFX9-NEXT:    v_bfe_u32 v13, v2, 16, 4
1996; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
1997; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
1998; GFX9-NEXT:    v_bfe_u32 v8, v1, 8, 4
1999; GFX9-NEXT:    v_bfe_u32 v9, v1, 4, 4
2000; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
2001; GFX9-NEXT:    v_bfe_u32 v16, v2, 4, 4
2002; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
2003; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
2004; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
2005; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v9, v16
2006; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
2007; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
2008; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2009; GFX9-NEXT:    v_add3_u32 v1, v1, s0, v2
2010; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
2011; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
2012; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
2013; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
2014; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v3, v10
2015; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v5
2016; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
2017; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
2018; GFX9-NEXT:    s_endpgm
2019;
2020; GFX9-DL-LABEL: udot8_acc32_vecMul:
2021; GFX9-DL:       ; %bb.0: ; %entry
2022; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2023; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2024; GFX9-DL-NEXT:    s_mov_b32 s10, -1
2025; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
2026; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
2027; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2028; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2029; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2030; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
2031; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
2032; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2033; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
2034; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
2035; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
2036; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2037; GFX9-DL-NEXT:    v_dot8_u32_u4 v0, v2, v3, s0
2038; GFX9-DL-NEXT:    global_store_dword v1, v0, s[2:3]
2039; GFX9-DL-NEXT:    s_endpgm
2040;
2041; GFX10-DL-LABEL: udot8_acc32_vecMul:
2042; GFX10-DL:       ; %bb.0: ; %entry
2043; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2044; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2045; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2046; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2047; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2048; GFX10-DL-NEXT:    s_mov_b32 s10, -1
2049; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
2050; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
2051; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
2052; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2053; GFX10-DL-NEXT:    s_clause 0x1
2054; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2055; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2056; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2057; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2058; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2059; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s2
2060; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
2061; GFX10-DL-NEXT:    s_endpgm
2062                                              <8 x i4> addrspace(1)* %src2,
2063                                              i32 addrspace(1)* nocapture %dst) {
2064entry:
  ; Both source pointers are indexed by the workitem id in x, so each workitem
  ; loads its own <8 x i4> from %src1 and from %src2.
2065  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2066  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2067  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2068  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2069  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2070
  ; Widen all eight i4 lanes of each input to i32 with a single vector zext.
2071  %cvec1 = zext <8 x i4> %vec1 to <8 x i32>
2072  %cvec2 = zext <8 x i4> %vec2 to <8 x i32>
2073
  ; One vector multiply produces all eight products; each lane is then
  ; extracted as a scalar i32.
2074  %mul = mul <8 x i32> %cvec1, %cvec2
2075  %mul0 = extractelement <8 x i32> %mul, i64 0
2076  %mul1 = extractelement <8 x i32> %mul, i64 1
2077  %mul2 = extractelement <8 x i32> %mul, i64 2
2078  %mul3 = extractelement <8 x i32> %mul, i64 3
2079  %mul4 = extractelement <8 x i32> %mul, i64 4
2080  %mul5 = extractelement <8 x i32> %mul, i64 5
2081  %mul6 = extractelement <8 x i32> %mul, i64 6
2082  %mul7 = extractelement <8 x i32> %mul, i64 7
2083
  ; Add the products, in lane order, onto the i32 currently stored at %dst.
2084  %acc = load i32, i32 addrspace(1)* %dst, align 4
2085  %add1 = add i32 %mul0, %acc
2086  %add2 = add i32 %add1, %mul1
2087  %add3 = add i32 %add2, %mul2
2088  %add4 = add i32 %add3, %mul3
2089  %add5 = add i32 %add4, %mul4
2090  %add6 = add i32 %add5, %mul5
2091  %add7 = add i32 %add6, %mul6
2092  %add8 = add i32 %add7, %mul7
2093
  ; Write the final sum back to the same location the accumulator came from.
2094  store i32 %add8, i32 addrspace(1)* %dst, align 4
2095  ret void
2096}
2097
2098; TODO: Clean up the code (pk_mad_I16 should be generated by default), then
2099; support the pattern.
2100define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
2101; GFX7-LABEL: udot8_acc16_vecMul:
2102; GFX7:       ; %bb.0: ; %entry
2103; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2104; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2105; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2106; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2107; GFX7-NEXT:    s_mov_b32 s14, -1
2108; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2109; GFX7-NEXT:    s_add_u32 s12, s12, s3
2110; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2111; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2112; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2113; GFX7-NEXT:    s_mov_b32 s10, 0
2114; GFX7-NEXT:    s_mov_b32 s11, s3
2115; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2116; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2117; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2118; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2119; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2120; GFX7-NEXT:    s_mov_b32 s2, -1
2121; GFX7-NEXT:    buffer_load_ushort v16, off, s[0:3], 0
2122; GFX7-NEXT:    s_mov_b32 s4, 0xf0000
2123; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2124; GFX7-NEXT:    s_waitcnt vmcnt(2)
2125; GFX7-NEXT:    v_bfe_u32 v7, v2, 20, 4
2126; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 12, v2
2127; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
2128; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
2129; GFX7-NEXT:    v_bfe_u32 v4, v2, 12, 4
2130; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 4
2131; GFX7-NEXT:    v_and_b32_e32 v6, 15, v2
2132; GFX7-NEXT:    v_alignbit_b32 v2, v7, v2, 16
2133; GFX7-NEXT:    v_and_b32_e32 v7, s4, v8
2134; GFX7-NEXT:    s_waitcnt vmcnt(1)
2135; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 12, v0
2136; GFX7-NEXT:    v_or_b32_e32 v6, v6, v7
2137; GFX7-NEXT:    v_and_b32_e32 v7, s4, v8
2138; GFX7-NEXT:    v_and_b32_e32 v13, 15, v0
2139; GFX7-NEXT:    v_or_b32_e32 v7, v13, v7
2140; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
2141; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 16, v7
2142; GFX7-NEXT:    v_and_b32_e32 v6, 15, v6
2143; GFX7-NEXT:    v_and_b32_e32 v7, 15, v7
2144; GFX7-NEXT:    s_waitcnt vmcnt(0)
2145; GFX7-NEXT:    v_mad_u32_u24 v6, v6, v7, v16
2146; GFX7-NEXT:    v_bfe_u32 v12, v0, 8, 4
2147; GFX7-NEXT:    v_mad_u32_u24 v6, v8, v13, v6
2148; GFX7-NEXT:    v_bfe_u32 v14, v0, 20, 4
2149; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
2150; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
2151; GFX7-NEXT:    v_bfe_u32 v11, v0, 12, 4
2152; GFX7-NEXT:    v_alignbit_b32 v0, v14, v0, 16
2153; GFX7-NEXT:    v_mad_u32_u24 v5, v5, v12, v6
2154; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v2
2155; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
2156; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
2157; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
2158; GFX7-NEXT:    v_mad_u32_u24 v4, v4, v11, v5
2159; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v4
2160; GFX7-NEXT:    v_mad_u32_u24 v0, v15, v14, v0
2161; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
2162; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v9, v0
2163; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
2164; GFX7-NEXT:    s_endpgm
2165;
2166; GFX8-LABEL: udot8_acc16_vecMul:
2167; GFX8:       ; %bb.0: ; %entry
2168; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2169; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2170; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2171; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2172; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2173; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2174; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2175; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2176; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2177; GFX8-NEXT:    flat_load_dword v4, v[0:1]
2178; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2179; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2180; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2181; GFX8-NEXT:    v_mov_b32_e32 v3, s1
2182; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2183; GFX8-NEXT:    v_mov_b32_e32 v2, s0
2184; GFX8-NEXT:    flat_load_ushort v18, v[2:3]
2185; GFX8-NEXT:    s_mov_b32 s10, -1
2186; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
2187; GFX8-NEXT:    s_add_u32 s8, s8, s3
2188; GFX8-NEXT:    s_addc_u32 s9, s9, 0
2189; GFX8-NEXT:    s_waitcnt vmcnt(2)
2190; GFX8-NEXT:    v_and_b32_e32 v1, 15, v4
2191; GFX8-NEXT:    v_bfe_u32 v5, v4, 4, 4
2192; GFX8-NEXT:    v_bfe_u32 v6, v4, 8, 4
2193; GFX8-NEXT:    v_bfe_u32 v7, v4, 12, 4
2194; GFX8-NEXT:    v_bfe_u32 v8, v4, 16, 4
2195; GFX8-NEXT:    s_waitcnt vmcnt(1)
2196; GFX8-NEXT:    v_and_b32_e32 v11, 15, v0
2197; GFX8-NEXT:    v_bfe_u32 v12, v0, 4, 4
2198; GFX8-NEXT:    s_waitcnt vmcnt(0)
2199; GFX8-NEXT:    v_mad_u16 v1, v1, v11, v18
2200; GFX8-NEXT:    v_bfe_u32 v13, v0, 8, 4
2201; GFX8-NEXT:    v_mad_u16 v1, v5, v12, v1
2202; GFX8-NEXT:    v_bfe_u32 v14, v0, 12, 4
2203; GFX8-NEXT:    v_mad_u16 v1, v6, v13, v1
2204; GFX8-NEXT:    v_bfe_u32 v15, v0, 16, 4
2205; GFX8-NEXT:    v_mad_u16 v1, v7, v14, v1
2206; GFX8-NEXT:    v_bfe_u32 v9, v4, 20, 4
2207; GFX8-NEXT:    v_bfe_u32 v16, v0, 20, 4
2208; GFX8-NEXT:    v_mad_u16 v1, v8, v15, v1
2209; GFX8-NEXT:    v_bfe_u32 v10, v4, 24, 4
2210; GFX8-NEXT:    v_bfe_u32 v17, v0, 24, 4
2211; GFX8-NEXT:    v_mad_u16 v1, v9, v16, v1
2212; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 28, v4
2213; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
2214; GFX8-NEXT:    v_mad_u16 v1, v10, v17, v1
2215; GFX8-NEXT:    v_mad_u16 v0, v4, v0, v1
2216; GFX8-NEXT:    flat_store_short v[2:3], v0
2217; GFX8-NEXT:    s_endpgm
2218;
2219; GFX9-LABEL: udot8_acc16_vecMul:
2220; GFX9:       ; %bb.0: ; %entry
2221; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2222; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2223; GFX9-NEXT:    s_mov_b32 s10, -1
2224; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
2225; GFX9-NEXT:    s_add_u32 s8, s8, s3
2226; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2227; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2228; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2229; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
2230; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2231; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2232; GFX9-NEXT:    global_load_dword v3, v0, s[4:5]
2233; GFX9-NEXT:    global_load_dword v4, v0, s[6:7]
2234; GFX9-NEXT:    s_addc_u32 s9, s9, 0
2235; GFX9-NEXT:    s_waitcnt vmcnt(1)
2236; GFX9-NEXT:    v_bfe_u32 v0, v3, 24, 4
2237; GFX9-NEXT:    v_bfe_u32 v6, v3, 16, 4
2238; GFX9-NEXT:    v_bfe_u32 v8, v3, 8, 4
2239; GFX9-NEXT:    s_waitcnt vmcnt(0)
2240; GFX9-NEXT:    v_bfe_u32 v11, v4, 24, 4
2241; GFX9-NEXT:    v_bfe_u32 v13, v4, 16, 4
2242; GFX9-NEXT:    v_bfe_u32 v15, v4, 8, 4
2243; GFX9-NEXT:    v_and_b32_e32 v17, 15, v4
2244; GFX9-NEXT:    v_and_b32_e32 v10, 15, v3
2245; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
2246; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
2247; GFX9-NEXT:    v_bfe_u32 v7, v3, 20, 4
2248; GFX9-NEXT:    v_and_b32_e32 v6, v2, v6
2249; GFX9-NEXT:    v_bfe_u32 v9, v3, 12, 4
2250; GFX9-NEXT:    v_and_b32_e32 v8, v2, v8
2251; GFX9-NEXT:    v_bfe_u32 v3, v3, 4, 4
2252; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 28, v4
2253; GFX9-NEXT:    v_bfe_u32 v14, v4, 20, 4
2254; GFX9-NEXT:    v_bfe_u32 v16, v4, 12, 4
2255; GFX9-NEXT:    v_bfe_u32 v4, v4, 4, 4
2256; GFX9-NEXT:    v_and_b32_e32 v17, v2, v17
2257; GFX9-NEXT:    v_and_b32_e32 v11, v2, v11
2258; GFX9-NEXT:    v_and_b32_e32 v13, v2, v13
2259; GFX9-NEXT:    v_and_b32_e32 v15, v2, v15
2260; GFX9-NEXT:    v_and_b32_e32 v2, v2, v10
2261; GFX9-NEXT:    v_lshl_or_b32 v4, v4, 16, v17
2262; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
2263; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v2, v4
2264; GFX9-NEXT:    global_load_ushort v4, v1, s[2:3]
2265; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
2266; GFX9-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
2267; GFX9-NEXT:    v_lshl_or_b32 v5, v14, 16, v13
2268; GFX9-NEXT:    v_lshl_or_b32 v7, v16, 16, v15
2269; GFX9-NEXT:    v_lshl_or_b32 v8, v9, 16, v8
2270; GFX9-NEXT:    v_pk_mul_lo_u16 v3, v6, v5
2271; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v8, v7
2272; GFX9-NEXT:    v_lshl_or_b32 v10, v12, 16, v11
2273; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, v10
2274; GFX9-NEXT:    s_waitcnt vmcnt(0)
2275; GFX9-NEXT:    v_add_u16_e32 v4, v2, v4
2276; GFX9-NEXT:    v_add_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2277; GFX9-NEXT:    v_add_u16_e32 v2, v2, v5
2278; GFX9-NEXT:    v_add_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2279; GFX9-NEXT:    v_add_u16_e32 v2, v2, v3
2280; GFX9-NEXT:    v_add_u16_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2281; GFX9-NEXT:    v_add_u16_e32 v2, v2, v0
2282; GFX9-NEXT:    v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2283; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
2284; GFX9-NEXT:    s_endpgm
2285;
2286; GFX9-DL-LABEL: udot8_acc16_vecMul:
2287; GFX9-DL:       ; %bb.0: ; %entry
2288; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2289; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2290; GFX9-DL-NEXT:    s_mov_b32 s10, -1
2291; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
2292; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
2293; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2294; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2295; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2296; GFX9-DL-NEXT:    v_mov_b32_e32 v2, 0xffff
2297; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
2298; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2299; GFX9-DL-NEXT:    global_load_dword v3, v0, s[4:5]
2300; GFX9-DL-NEXT:    global_load_dword v4, v0, s[6:7]
2301; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
2302; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2303; GFX9-DL-NEXT:    v_bfe_u32 v0, v3, 24, 4
2304; GFX9-DL-NEXT:    v_bfe_u32 v6, v3, 16, 4
2305; GFX9-DL-NEXT:    v_bfe_u32 v8, v3, 8, 4
2306; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2307; GFX9-DL-NEXT:    v_bfe_u32 v11, v4, 24, 4
2308; GFX9-DL-NEXT:    v_bfe_u32 v13, v4, 16, 4
2309; GFX9-DL-NEXT:    v_bfe_u32 v15, v4, 8, 4
2310; GFX9-DL-NEXT:    v_and_b32_e32 v17, 15, v4
2311; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v3
2312; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
2313; GFX9-DL-NEXT:    v_and_b32_e32 v0, v2, v0
2314; GFX9-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
2315; GFX9-DL-NEXT:    v_and_b32_e32 v6, v2, v6
2316; GFX9-DL-NEXT:    v_bfe_u32 v9, v3, 12, 4
2317; GFX9-DL-NEXT:    v_and_b32_e32 v8, v2, v8
2318; GFX9-DL-NEXT:    v_bfe_u32 v3, v3, 4, 4
2319; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 28, v4
2320; GFX9-DL-NEXT:    v_bfe_u32 v14, v4, 20, 4
2321; GFX9-DL-NEXT:    v_bfe_u32 v16, v4, 12, 4
2322; GFX9-DL-NEXT:    v_bfe_u32 v4, v4, 4, 4
2323; GFX9-DL-NEXT:    v_and_b32_e32 v17, v2, v17
2324; GFX9-DL-NEXT:    v_and_b32_e32 v11, v2, v11
2325; GFX9-DL-NEXT:    v_and_b32_e32 v13, v2, v13
2326; GFX9-DL-NEXT:    v_and_b32_e32 v15, v2, v15
2327; GFX9-DL-NEXT:    v_and_b32_e32 v2, v2, v10
2328; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v17
2329; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
2330; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v4
2331; GFX9-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
2332; GFX9-DL-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
2333; GFX9-DL-NEXT:    v_lshl_or_b32 v6, v7, 16, v6
2334; GFX9-DL-NEXT:    v_lshl_or_b32 v5, v14, 16, v13
2335; GFX9-DL-NEXT:    v_lshl_or_b32 v7, v16, 16, v15
2336; GFX9-DL-NEXT:    v_lshl_or_b32 v8, v9, 16, v8
2337; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v3, v6, v5
2338; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v8, v7
2339; GFX9-DL-NEXT:    v_lshl_or_b32 v10, v12, 16, v11
2340; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v0, v0, v10
2341; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2342; GFX9-DL-NEXT:    v_add_u16_e32 v4, v2, v4
2343; GFX9-DL-NEXT:    v_add_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2344; GFX9-DL-NEXT:    v_add_u16_e32 v2, v2, v5
2345; GFX9-DL-NEXT:    v_add_u16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2346; GFX9-DL-NEXT:    v_add_u16_e32 v2, v2, v3
2347; GFX9-DL-NEXT:    v_add_u16_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2348; GFX9-DL-NEXT:    v_add_u16_e32 v2, v2, v0
2349; GFX9-DL-NEXT:    v_add_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2350; GFX9-DL-NEXT:    global_store_short v1, v0, s[2:3]
2351; GFX9-DL-NEXT:    s_endpgm
2352;
2353; GFX10-DL-LABEL: udot8_acc16_vecMul:
2354; GFX10-DL:       ; %bb.0: ; %entry
2355; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2356; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2357; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2358; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
2359; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2360; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2361; GFX10-DL-NEXT:    s_mov_b32 s10, -1
2362; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
2363; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
2364; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
2365; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2366; GFX10-DL-NEXT:    s_clause 0x1
2367; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2368; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2369; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2370; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
2371; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2372; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v1
2373; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2374; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v2
2375; GFX10-DL-NEXT:    v_bfe_u32 v9, v1, 4, 4
2376; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
2377; GFX10-DL-NEXT:    v_bfe_u32 v12, v1, 8, 4
2378; GFX10-DL-NEXT:    v_and_b32_e32 v7, v4, v7
2379; GFX10-DL-NEXT:    v_and_b32_e32 v6, v4, v6
2380; GFX10-DL-NEXT:    v_bfe_u32 v13, v2, 8, 4
2381; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 16, 4
2382; GFX10-DL-NEXT:    v_and_b32_e32 v12, v4, v12
2383; GFX10-DL-NEXT:    v_lshl_or_b32 v7, v9, 16, v7
2384; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v10, 16, v6
2385; GFX10-DL-NEXT:    v_bfe_u32 v9, v1, 12, 4
2386; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 12, 4
2387; GFX10-DL-NEXT:    v_and_b32_e32 v13, v4, v13
2388; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
2389; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v6, v7, v6
2390; GFX10-DL-NEXT:    v_bfe_u32 v7, v2, 16, 4
2391; GFX10-DL-NEXT:    v_lshl_or_b32 v9, v9, 16, v12
2392; GFX10-DL-NEXT:    v_lshl_or_b32 v10, v10, 16, v13
2393; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
2394; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
2395; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2396; GFX10-DL-NEXT:    v_add_nc_u16 v3, v6, v3
2397; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 20, 4
2398; GFX10-DL-NEXT:    v_and_b32_e32 v11, v4, v11
2399; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v9, v9, v10
2400; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
2401; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v12
2402; GFX10-DL-NEXT:    v_and_b32_e32 v7, v4, v7
2403; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 24, 4
2404; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v1, 16, v11
2405; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
2406; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v9
2407; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
2408; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v9
2409; GFX10-DL-NEXT:    v_and_b32_e32 v9, v4, v10
2410; GFX10-DL-NEXT:    v_and_b32_e32 v4, v4, v5
2411; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v6
2412; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v7
2413; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v9
2414; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v8, 16, v4
2415; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
2416; GFX10-DL-NEXT:    v_add_nc_u16 v1, v3, v1
2417; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v4, v2
2418; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v5
2419; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2420; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v2
2421; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
2422; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
2423; GFX10-DL-NEXT:    s_endpgm
2424                                              <8 x i4> addrspace(1)* %src2,
2425                                              i16 addrspace(1)* nocapture %dst) {
2426entry:
2427  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2428  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2429  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2430  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2431  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2432
2433  %cvec1 = zext <8 x i4> %vec1 to <8 x i16>
2434  %cvec2 = zext <8 x i4> %vec2 to <8 x i16>
2435
2436  %mul = mul <8 x i16> %cvec1, %cvec2
2437  %mul0 = extractelement <8 x i16> %mul, i64 0
2438  %mul1 = extractelement <8 x i16> %mul, i64 1
2439  %mul2 = extractelement <8 x i16> %mul, i64 2
2440  %mul3 = extractelement <8 x i16> %mul, i64 3
2441  %mul4 = extractelement <8 x i16> %mul, i64 4
2442  %mul5 = extractelement <8 x i16> %mul, i64 5
2443  %mul6 = extractelement <8 x i16> %mul, i64 6
2444  %mul7 = extractelement <8 x i16> %mul, i64 7
2445
2446  %acc = load i16, i16 addrspace(1)* %dst, align 4
2447  %add1 = add i16 %mul0, %acc
2448  %add2 = add i16 %add1, %mul1
2449  %add3 = add i16 %add2, %mul2
2450  %add4 = add i16 %add3, %mul3
2451  %add5 = add i16 %add4, %mul4
2452  %add6 = add i16 %add5, %mul5
2453  %add7 = add i16 %add6, %mul6
2454  %add8 = add i16 %add7, %mul7
2455
2456  store i16 %add8, i16 addrspace(1)* %dst, align 4
2457  ret void
2458}
2459
2460; TODO: Clean up the code to generate MAD; the pattern should be recognized then.
2461define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
2462; GFX7-LABEL: udot8_acc8_vecMul:
2463; GFX7:       ; %bb.0: ; %entry
2464; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2465; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2466; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2467; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2468; GFX7-NEXT:    s_mov_b32 s14, -1
2469; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2470; GFX7-NEXT:    s_add_u32 s12, s12, s3
2471; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2472; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2473; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2474; GFX7-NEXT:    s_mov_b32 s10, 0
2475; GFX7-NEXT:    s_mov_b32 s11, s3
2476; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2477; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2478; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2479; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2480; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2481; GFX7-NEXT:    s_mov_b32 s2, -1
2482; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
2483; GFX7-NEXT:    s_movk_i32 s4, 0xf00
2484; GFX7-NEXT:    v_mov_b32_e32 v3, 0xf00
2485; GFX7-NEXT:    s_movk_i32 s5, 0xf0f
2486; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2487; GFX7-NEXT:    s_waitcnt vmcnt(2)
2488; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 28, v2
2489; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 4, v2
2490; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 4, v2
2491; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 12, v2
2492; GFX7-NEXT:    v_bfe_u32 v1, v2, 8, 4
2493; GFX7-NEXT:    v_and_b32_e32 v5, 15, v2
2494; GFX7-NEXT:    v_bfe_u32 v7, v2, 16, 4
2495; GFX7-NEXT:    v_alignbit_b32 v2, v6, v2, 24
2496; GFX7-NEXT:    v_and_b32_e32 v6, s4, v9
2497; GFX7-NEXT:    s_waitcnt vmcnt(1)
2498; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 4, v0
2499; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 4, v0
2500; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
2501; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
2502; GFX7-NEXT:    v_and_b32_e32 v6, v3, v9
2503; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 4
2504; GFX7-NEXT:    v_and_b32_e32 v3, v3, v11
2505; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
2506; GFX7-NEXT:    v_or_b32_e32 v3, v10, v3
2507; GFX7-NEXT:    v_and_b32_e32 v12, 15, v0
2508; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2509; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
2510; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
2511; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2512; GFX7-NEXT:    v_and_b32_e32 v2, s5, v2
2513; GFX7-NEXT:    v_or_b32_e32 v3, v6, v3
2514; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
2515; GFX7-NEXT:    v_bfe_u32 v14, v0, 16, 4
2516; GFX7-NEXT:    v_alignbit_b32 v0, v13, v0, 24
2517; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
2518; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
2519; GFX7-NEXT:    v_and_b32_e32 v4, s4, v15
2520; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
2521; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
2522; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2523; GFX7-NEXT:    v_and_b32_e32 v6, 15, v1
2524; GFX7-NEXT:    v_and_b32_e32 v12, 15, v3
2525; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
2526; GFX7-NEXT:    v_or_b32_e32 v4, v14, v4
2527; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2528; GFX7-NEXT:    v_bfe_u32 v7, v1, 8, 4
2529; GFX7-NEXT:    v_bfe_u32 v13, v3, 8, 4
2530; GFX7-NEXT:    s_waitcnt vmcnt(0)
2531; GFX7-NEXT:    v_mad_u32_u24 v6, v6, v12, v16
2532; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
2533; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
2534; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 24, v3
2535; GFX7-NEXT:    v_bfe_u32 v1, v1, 16, 4
2536; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 4
2537; GFX7-NEXT:    v_mad_u32_u24 v6, v7, v13, v6
2538; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, v6
2539; GFX7-NEXT:    v_and_b32_e32 v8, 15, v2
2540; GFX7-NEXT:    v_and_b32_e32 v14, 15, v0
2541; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v10, v1
2542; GFX7-NEXT:    v_bfe_u32 v9, v2, 8, 4
2543; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
2544; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v14, v1
2545; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
2546; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
2547; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 4
2548; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 4
2549; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v15, v1
2550; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2551; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v11, v0
2552; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2553; GFX7-NEXT:    s_endpgm
2554;
2555; GFX8-LABEL: udot8_acc8_vecMul:
2556; GFX8:       ; %bb.0: ; %entry
2557; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2558; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2559; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2560; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2561; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2562; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2563; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2564; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2565; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2566; GFX8-NEXT:    v_mov_b32_e32 v3, s7
2567; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v2
2568; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
2569; GFX8-NEXT:    flat_load_dword v4, v[0:1]
2570; GFX8-NEXT:    flat_load_dword v2, v[2:3]
2571; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2572; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2573; GFX8-NEXT:    flat_load_ubyte v5, v[0:1]
2574; GFX8-NEXT:    s_mov_b32 s10, -1
2575; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
2576; GFX8-NEXT:    s_add_u32 s8, s8, s3
2577; GFX8-NEXT:    s_addc_u32 s9, s9, 0
2578; GFX8-NEXT:    s_waitcnt vmcnt(2)
2579; GFX8-NEXT:    v_bfe_u32 v3, v4, 20, 4
2580; GFX8-NEXT:    s_waitcnt vmcnt(1)
2581; GFX8-NEXT:    v_bfe_u32 v13, v2, 20, 4
2582; GFX8-NEXT:    v_bfe_u32 v7, v4, 24, 4
2583; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 28, v4
2584; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
2585; GFX8-NEXT:    v_bfe_u32 v14, v2, 24, 4
2586; GFX8-NEXT:    v_mul_lo_u16_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2587; GFX8-NEXT:    v_bfe_u32 v6, v4, 16, 4
2588; GFX8-NEXT:    v_bfe_u32 v12, v2, 16, 4
2589; GFX8-NEXT:    v_bfe_u32 v9, v4, 8, 4
2590; GFX8-NEXT:    v_bfe_u32 v16, v2, 8, 4
2591; GFX8-NEXT:    v_bfe_u32 v10, v4, 12, 4
2592; GFX8-NEXT:    v_and_b32_e32 v11, 15, v4
2593; GFX8-NEXT:    v_bfe_u32 v17, v2, 12, 4
2594; GFX8-NEXT:    v_and_b32_e32 v18, 15, v2
2595; GFX8-NEXT:    v_bfe_u32 v4, v4, 4, 4
2596; GFX8-NEXT:    v_bfe_u32 v2, v2, 4, 4
2597; GFX8-NEXT:    v_mul_lo_u16_e32 v13, v7, v14
2598; GFX8-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2599; GFX8-NEXT:    v_mul_lo_u16_e32 v19, v6, v12
2600; GFX8-NEXT:    v_mul_lo_u16_e32 v9, v9, v16
2601; GFX8-NEXT:    v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2602; GFX8-NEXT:    v_mul_lo_u16_e32 v11, v11, v18
2603; GFX8-NEXT:    v_mul_lo_u16_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2604; GFX8-NEXT:    v_or_b32_e32 v8, v13, v8
2605; GFX8-NEXT:    v_or_b32_e32 v9, v9, v10
2606; GFX8-NEXT:    v_or_b32_e32 v10, v11, v4
2607; GFX8-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
2608; GFX8-NEXT:    v_or_b32_e32 v3, v19, v3
2609; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2610; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v9
2611; GFX8-NEXT:    v_or_b32_e32 v4, v4, v2
2612; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
2613; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 24, v[2:3]
2614; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
2615; GFX8-NEXT:    s_waitcnt vmcnt(0)
2616; GFX8-NEXT:    v_add_u16_e32 v3, v10, v5
2617; GFX8-NEXT:    v_add_u16_e32 v3, v3, v4
2618; GFX8-NEXT:    v_add_u16_e32 v3, v3, v9
2619; GFX8-NEXT:    v_add_u16_e32 v2, v3, v2
2620; GFX8-NEXT:    v_mad_u16 v2, v6, v12, v2
2621; GFX8-NEXT:    v_add_u16_e32 v2, v2, v11
2622; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
2623; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
2624; GFX8-NEXT:    v_add_u16_e32 v2, v2, v8
2625; GFX8-NEXT:    flat_store_byte v[0:1], v2
2626; GFX8-NEXT:    s_endpgm
2627;
2628; GFX9-LABEL: udot8_acc8_vecMul:
2629; GFX9:       ; %bb.0: ; %entry
2630; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2631; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2632; GFX9-NEXT:    s_mov_b32 s10, -1
2633; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
2634; GFX9-NEXT:    s_add_u32 s8, s8, s3
2635; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2636; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2637; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2638; GFX9-NEXT:    v_mov_b32_e32 v3, 0
2639; GFX9-NEXT:    s_addc_u32 s9, s9, 0
2640; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2641; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
2642; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
2643; GFX9-NEXT:    global_load_ubyte v4, v3, s[2:3]
2644; GFX9-NEXT:    s_waitcnt vmcnt(2)
2645; GFX9-NEXT:    v_bfe_u32 v0, v1, 20, 4
2646; GFX9-NEXT:    s_waitcnt vmcnt(1)
2647; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
2648; GFX9-NEXT:    v_bfe_u32 v6, v1, 24, 4
2649; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 28, v1
2650; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
2651; GFX9-NEXT:    v_bfe_u32 v13, v2, 24, 4
2652; GFX9-NEXT:    v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2653; GFX9-NEXT:    v_bfe_u32 v5, v1, 16, 4
2654; GFX9-NEXT:    v_bfe_u32 v11, v2, 16, 4
2655; GFX9-NEXT:    v_bfe_u32 v8, v1, 8, 4
2656; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
2657; GFX9-NEXT:    v_bfe_u32 v9, v1, 12, 4
2658; GFX9-NEXT:    v_and_b32_e32 v10, 15, v1
2659; GFX9-NEXT:    v_bfe_u32 v16, v2, 12, 4
2660; GFX9-NEXT:    v_and_b32_e32 v17, 15, v2
2661; GFX9-NEXT:    v_bfe_u32 v1, v1, 4, 4
2662; GFX9-NEXT:    v_bfe_u32 v2, v2, 4, 4
2663; GFX9-NEXT:    v_mul_lo_u16_e32 v12, v6, v13
2664; GFX9-NEXT:    v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2665; GFX9-NEXT:    v_mul_lo_u16_e32 v18, v5, v11
2666; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2667; GFX9-NEXT:    v_mul_lo_u16_e32 v8, v8, v15
2668; GFX9-NEXT:    v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2669; GFX9-NEXT:    v_mul_lo_u16_e32 v10, v10, v17
2670; GFX9-NEXT:    v_or_b32_e32 v7, v12, v7
2671; GFX9-NEXT:    v_or_b32_e32 v8, v8, v9
2672; GFX9-NEXT:    v_or_b32_e32 v1, v18, v0
2673; GFX9-NEXT:    v_or_b32_e32 v9, v10, v2
2674; GFX9-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
2675; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2676; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
2677; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
2678; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
2679; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
2680; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
2681; GFX9-NEXT:    s_waitcnt vmcnt(0)
2682; GFX9-NEXT:    v_add_u16_e32 v1, v9, v4
2683; GFX9-NEXT:    v_add_u16_e32 v1, v1, v2
2684; GFX9-NEXT:    v_add_u16_e32 v1, v1, v8
2685; GFX9-NEXT:    v_add_u16_e32 v0, v1, v0
2686; GFX9-NEXT:    v_mad_legacy_u16 v0, v5, v11, v0
2687; GFX9-NEXT:    v_add_u16_e32 v0, v0, v10
2688; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
2689; GFX9-NEXT:    v_mad_legacy_u16 v0, v6, v13, v0
2690; GFX9-NEXT:    v_add_u16_e32 v0, v0, v7
2691; GFX9-NEXT:    global_store_byte v3, v0, s[2:3]
2692; GFX9-NEXT:    s_endpgm
2693;
2694; GFX9-DL-LABEL: udot8_acc8_vecMul:
2695; GFX9-DL:       ; %bb.0: ; %entry
2696; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2697; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2698; GFX9-DL-NEXT:    s_mov_b32 s10, -1
2699; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
2700; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
2701; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2702; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2703; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2704; GFX9-DL-NEXT:    v_mov_b32_e32 v3, 0
2705; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
2706; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2707; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2708; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2709; GFX9-DL-NEXT:    global_load_ubyte v4, v3, s[2:3]
2710; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2711; GFX9-DL-NEXT:    v_bfe_u32 v0, v1, 20, 4
2712; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2713; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 20, 4
2714; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 24, 4
2715; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 28, v1
2716; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
2717; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 24, 4
2718; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2719; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 16, 4
2720; GFX9-DL-NEXT:    v_bfe_u32 v11, v2, 16, 4
2721; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 8, 4
2722; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 8, 4
2723; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 12, 4
2724; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v1
2725; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 12, 4
2726; GFX9-DL-NEXT:    v_and_b32_e32 v17, 15, v2
2727; GFX9-DL-NEXT:    v_bfe_u32 v1, v1, 4, 4
2728; GFX9-DL-NEXT:    v_bfe_u32 v2, v2, 4, 4
2729; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v12, v6, v13
2730; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2731; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v18, v5, v11
2732; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2733; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v8, v8, v15
2734; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2735; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v10, v10, v17
2736; GFX9-DL-NEXT:    v_or_b32_e32 v7, v12, v7
2737; GFX9-DL-NEXT:    v_or_b32_e32 v8, v8, v9
2738; GFX9-DL-NEXT:    v_or_b32_e32 v1, v18, v0
2739; GFX9-DL-NEXT:    v_or_b32_e32 v9, v10, v2
2740; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
2741; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2742; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
2743; GFX9-DL-NEXT:    v_or_b32_e32 v2, v2, v0
2744; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
2745; GFX9-DL-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
2746; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
2747; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2748; GFX9-DL-NEXT:    v_add_u16_e32 v1, v9, v4
2749; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v2
2750; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v8
2751; GFX9-DL-NEXT:    v_add_u16_e32 v0, v1, v0
2752; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v5, v11, v0
2753; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v10
2754; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
2755; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v6, v13, v0
2756; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v7
2757; GFX9-DL-NEXT:    global_store_byte v3, v0, s[2:3]
2758; GFX9-DL-NEXT:    s_endpgm
2759;
2760; GFX10-DL-LABEL: udot8_acc8_vecMul:
2761; GFX10-DL:       ; %bb.0: ; %entry
2762; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2763; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2764; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2765; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 0
2766; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2767; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2768; GFX10-DL-NEXT:    s_mov_b32 s10, -1
2769; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
2770; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
2771; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
2772; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2773; GFX10-DL-NEXT:    s_clause 0x1
2774; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2775; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2776; GFX10-DL-NEXT:    global_load_ubyte v3, v4, s[0:1]
2777; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2778; GFX10-DL-NEXT:    v_bfe_u32 v9, v1, 12, 4
2779; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2780; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 12, 4
2781; GFX10-DL-NEXT:    v_bfe_u32 v8, v1, 8, 4
2782; GFX10-DL-NEXT:    v_bfe_u32 v13, v2, 8, 4
2783; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 28, v1
2784; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
2785; GFX10-DL-NEXT:    v_mul_lo_u16 v9, v9, v10
2786; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 16, 4
2787; GFX10-DL-NEXT:    v_mul_lo_u16 v8, v8, v13
2788; GFX10-DL-NEXT:    v_bfe_u32 v0, v1, 20, 4
2789; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 24, 4
2790; GFX10-DL-NEXT:    v_and_b32_e32 v11, 15, v1
2791; GFX10-DL-NEXT:    v_lshlrev_b16 v9, 8, v9
2792; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 4, 4
2793; GFX10-DL-NEXT:    v_bfe_u32 v15, v2, 4, 4
2794; GFX10-DL-NEXT:    v_mul_lo_u16 v7, v7, v14
2795; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 20, 4
2796; GFX10-DL-NEXT:    v_bfe_u32 v13, v2, 24, 4
2797; GFX10-DL-NEXT:    v_bfe_u32 v12, v2, 16, 4
2798; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v2
2799; GFX10-DL-NEXT:    v_mul_lo_u16 v1, v1, v15
2800; GFX10-DL-NEXT:    v_or_b32_e32 v8, v8, v9
2801; GFX10-DL-NEXT:    v_mul_lo_u16 v9, v0, v10
2802; GFX10-DL-NEXT:    v_mul_lo_u16 v10, v6, v13
2803; GFX10-DL-NEXT:    v_lshlrev_b16 v7, 8, v7
2804; GFX10-DL-NEXT:    v_mul_lo_u16 v2, v11, v2
2805; GFX10-DL-NEXT:    v_lshlrev_b16 v1, 8, v1
2806; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
2807; GFX10-DL-NEXT:    v_mul_lo_u16 v11, v5, v12
2808; GFX10-DL-NEXT:    v_or_b32_e32 v7, v10, v7
2809; GFX10-DL-NEXT:    v_lshlrev_b16 v9, 8, v9
2810; GFX10-DL-NEXT:    v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2811; GFX10-DL-NEXT:    v_or_b32_e32 v1, v2, v1
2812; GFX10-DL-NEXT:    v_or_b32_e32 v2, v11, v9
2813; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
2814; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v10, 8, v10
2815; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2816; GFX10-DL-NEXT:    v_add_nc_u16 v3, v1, v3
2817; GFX10-DL-NEXT:    v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2818; GFX10-DL-NEXT:    v_add_nc_u16 v9, v3, v10
2819; GFX10-DL-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
2820; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
2821; GFX10-DL-NEXT:    v_add_nc_u16 v0, v9, v8
2822; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v2
2823; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v12, v0
2824; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v1
2825; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v7
2826; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v13, v0
2827; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v1
2828; GFX10-DL-NEXT:    global_store_byte v4, v0, s[0:1]
2829; GFX10-DL-NEXT:    s_endpgm
2830                                             <8 x i4> addrspace(1)* %src2,
2831                                             i8 addrspace(1)* nocapture %dst) {
2832entry:
2833  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2834  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2835  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2836  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2837  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2838
2839  %cvec1 = zext <8 x i4> %vec1 to <8 x i8>
2840  %cvec2 = zext <8 x i4> %vec2 to <8 x i8>
2841
2842  %mul = mul <8 x i8> %cvec1, %cvec2
2843  %mul0 = extractelement <8 x i8> %mul, i64 0
2844  %mul1 = extractelement <8 x i8> %mul, i64 1
2845  %mul2 = extractelement <8 x i8> %mul, i64 2
2846  %mul3 = extractelement <8 x i8> %mul, i64 3
2847  %mul4 = extractelement <8 x i8> %mul, i64 4
2848  %mul5 = extractelement <8 x i8> %mul, i64 5
2849  %mul6 = extractelement <8 x i8> %mul, i64 6
2850  %mul7 = extractelement <8 x i8> %mul, i64 7
2851
2852  %acc = load i8, i8 addrspace(1)* %dst, align 4
2853  %add1 = add i8 %mul0, %acc
2854  %add2 = add i8 %add1, %mul1
2855  %add3 = add i8 %add2, %mul2
2856  %add4 = add i8 %add3, %mul3
2857  %add5 = add i8 %add4, %mul4
2858  %add6 = add i8 %add5, %mul5
2859  %add7 = add i8 %add6, %mul6
2860  %add8 = add i8 %add7, %mul7
2861
2862  store i8 %add8, i8 addrspace(1)* %dst, align 4
2863  ret void
2864}
2865
2866; TODO: Once the additional "and+add" are removed, the pattern will be recognized.
; udot8_acc4_vecMul: each work-item loads one <8 x i4> vector from %src1 and
; one from %src2, multiplies them element-wise at i4 width, and adds the eight
; i4 products onto the i4 accumulator loaded from %dst before storing it back.
; None of the check blocks below contain a v_dot8 instruction; every target
; expands the reduction into per-nibble multiplies and adds (or multiply-adds)
; followed by a final mask with 15.
2867define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
2868; GFX7-LABEL: udot8_acc4_vecMul:
2869; GFX7:       ; %bb.0: ; %entry
2870; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2871; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2872; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2873; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2874; GFX7-NEXT:    s_mov_b32 s14, -1
2875; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2876; GFX7-NEXT:    s_add_u32 s12, s12, s3
2877; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2878; GFX7-NEXT:    s_mov_b32 s10, 0
2879; GFX7-NEXT:    s_mov_b32 s11, s3
2880; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2881; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2882; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2883; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2884; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2885; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2886; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2887; GFX7-NEXT:    s_mov_b32 s2, -1
2888; GFX7-NEXT:    buffer_load_ubyte v16, off, s[0:3], 0
2889; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2890; GFX7-NEXT:    s_waitcnt vmcnt(2)
2891; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
2892; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
2893; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
2894; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
2895; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
2896; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
2897; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
2898; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
2899; GFX7-NEXT:    s_waitcnt vmcnt(1)
2900; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
2901; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
2902; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
2903; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
2904; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
2905; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
2906; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
2907; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
2908; GFX7-NEXT:    s_waitcnt vmcnt(0)
2909; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v16
2910; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
2911; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
2912; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
2913; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
2914; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
2915; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
2916; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v9, v0
2917; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
2918; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2919; GFX7-NEXT:    s_endpgm
2920;
2921; GFX8-LABEL: udot8_acc4_vecMul:
2922; GFX8:       ; %bb.0: ; %entry
2923; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2924; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2925; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2926; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2927; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2928; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2929; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2930; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2931; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2932; GFX8-NEXT:    flat_load_dword v4, v[0:1]
2933; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2934; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2935; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2936; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2937; GFX8-NEXT:    v_mov_b32_e32 v3, s1
2938; GFX8-NEXT:    v_mov_b32_e32 v2, s0
2939; GFX8-NEXT:    s_mov_b32 s10, -1
2940; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
2941; GFX8-NEXT:    s_add_u32 s8, s8, s3
2942; GFX8-NEXT:    s_addc_u32 s9, s9, 0
2943; GFX8-NEXT:    s_waitcnt vmcnt(1)
2944; GFX8-NEXT:    v_and_b32_e32 v1, 15, v4
2945; GFX8-NEXT:    v_bfe_u32 v5, v4, 4, 4
2946; GFX8-NEXT:    v_bfe_u32 v6, v4, 8, 4
2947; GFX8-NEXT:    v_bfe_u32 v7, v4, 12, 4
2948; GFX8-NEXT:    s_waitcnt vmcnt(0)
2949; GFX8-NEXT:    v_and_b32_e32 v11, 15, v0
2950; GFX8-NEXT:    v_bfe_u32 v12, v0, 4, 4
2951; GFX8-NEXT:    v_bfe_u32 v13, v0, 8, 4
2952; GFX8-NEXT:    v_bfe_u32 v14, v0, 12, 4
2953; GFX8-NEXT:    v_bfe_u32 v8, v4, 16, 4
2954; GFX8-NEXT:    v_bfe_u32 v15, v0, 16, 4
2955; GFX8-NEXT:    v_bfe_u32 v9, v4, 20, 4
2956; GFX8-NEXT:    v_bfe_u32 v10, v4, 24, 4
2957; GFX8-NEXT:    v_bfe_u32 v16, v0, 20, 4
2958; GFX8-NEXT:    v_bfe_u32 v17, v0, 24, 4
2959; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 28, v4
2960; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
2961; GFX8-NEXT:    v_mul_u32_u24_e32 v0, v4, v0
2962; GFX8-NEXT:    v_mul_u32_u24_e32 v4, v10, v17
2963; GFX8-NEXT:    flat_load_ubyte v10, v[2:3]
2964; GFX8-NEXT:    v_mul_u32_u24_e32 v1, v1, v11
2965; GFX8-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
2966; GFX8-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
2967; GFX8-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
2968; GFX8-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
2969; GFX8-NEXT:    v_mul_u32_u24_e32 v9, v9, v16
2970; GFX8-NEXT:    s_waitcnt vmcnt(0)
2971; GFX8-NEXT:    v_add_u16_e32 v1, v1, v10
2972; GFX8-NEXT:    v_add_u16_e32 v1, v1, v5
2973; GFX8-NEXT:    v_add_u16_e32 v1, v1, v6
2974; GFX8-NEXT:    v_add_u16_e32 v1, v1, v7
2975; GFX8-NEXT:    v_add_u16_e32 v1, v1, v8
2976; GFX8-NEXT:    v_add_u16_e32 v1, v1, v9
2977; GFX8-NEXT:    v_add_u16_e32 v1, v1, v4
2978; GFX8-NEXT:    v_add_u16_e32 v0, v1, v0
2979; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
2980; GFX8-NEXT:    flat_store_byte v[2:3], v0
2981; GFX8-NEXT:    s_endpgm
2982;
2983; GFX9-LABEL: udot8_acc4_vecMul:
2984; GFX9:       ; %bb.0: ; %entry
2985; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2986; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2987; GFX9-NEXT:    s_mov_b32 s10, -1
2988; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
2989; GFX9-NEXT:    s_add_u32 s8, s8, s3
2990; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2991; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2992; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2993; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2994; GFX9-NEXT:    s_addc_u32 s9, s9, 0
2995; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2996; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
2997; GFX9-NEXT:    global_load_dword v3, v0, s[6:7]
2998; GFX9-NEXT:    s_waitcnt vmcnt(1)
2999; GFX9-NEXT:    v_and_b32_e32 v0, 15, v2
3000; GFX9-NEXT:    s_waitcnt vmcnt(0)
3001; GFX9-NEXT:    v_and_b32_e32 v10, 15, v3
3002; GFX9-NEXT:    v_bfe_u32 v4, v2, 4, 4
3003; GFX9-NEXT:    v_bfe_u32 v11, v3, 4, 4
3004; GFX9-NEXT:    v_bfe_u32 v5, v2, 8, 4
3005; GFX9-NEXT:    v_bfe_u32 v12, v3, 8, 4
3006; GFX9-NEXT:    v_bfe_u32 v6, v2, 12, 4
3007; GFX9-NEXT:    v_bfe_u32 v13, v3, 12, 4
3008; GFX9-NEXT:    v_bfe_u32 v7, v2, 16, 4
3009; GFX9-NEXT:    v_bfe_u32 v14, v3, 16, 4
3010; GFX9-NEXT:    v_bfe_u32 v8, v2, 20, 4
3011; GFX9-NEXT:    v_bfe_u32 v9, v2, 24, 4
3012; GFX9-NEXT:    v_bfe_u32 v15, v3, 20, 4
3013; GFX9-NEXT:    v_bfe_u32 v16, v3, 24, 4
3014; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
3015; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
3016; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v2, v3
3017; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v9, v16
3018; GFX9-NEXT:    global_load_ubyte v9, v1, s[2:3]
3019; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v0, v10
3020; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
3021; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
3022; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
3023; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
3024; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
3025; GFX9-NEXT:    s_waitcnt vmcnt(0)
3026; GFX9-NEXT:    v_add_u16_e32 v0, v0, v9
3027; GFX9-NEXT:    v_add_u16_e32 v0, v0, v4
3028; GFX9-NEXT:    v_add_u16_e32 v0, v0, v5
3029; GFX9-NEXT:    v_add_u16_e32 v0, v0, v6
3030; GFX9-NEXT:    v_add_u16_e32 v0, v0, v7
3031; GFX9-NEXT:    v_add_u16_e32 v0, v0, v8
3032; GFX9-NEXT:    v_add_u16_e32 v0, v0, v3
3033; GFX9-NEXT:    v_add_u16_e32 v0, v0, v2
3034; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
3035; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
3036; GFX9-NEXT:    s_endpgm
3037;
3038; GFX9-DL-LABEL: udot8_acc4_vecMul:
3039; GFX9-DL:       ; %bb.0: ; %entry
3040; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3041; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3042; GFX9-DL-NEXT:    s_mov_b32 s10, -1
3043; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
3044; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
3045; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3046; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
3047; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3048; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
3049; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
3050; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3051; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
3052; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
3053; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
3054; GFX9-DL-NEXT:    v_and_b32_e32 v0, 15, v2
3055; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
3056; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v3
3057; GFX9-DL-NEXT:    v_bfe_u32 v4, v2, 4, 4
3058; GFX9-DL-NEXT:    v_bfe_u32 v11, v3, 4, 4
3059; GFX9-DL-NEXT:    v_bfe_u32 v5, v2, 8, 4
3060; GFX9-DL-NEXT:    v_bfe_u32 v12, v3, 8, 4
3061; GFX9-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
3062; GFX9-DL-NEXT:    v_bfe_u32 v13, v3, 12, 4
3063; GFX9-DL-NEXT:    v_bfe_u32 v7, v2, 16, 4
3064; GFX9-DL-NEXT:    v_bfe_u32 v14, v3, 16, 4
3065; GFX9-DL-NEXT:    v_bfe_u32 v8, v2, 20, 4
3066; GFX9-DL-NEXT:    v_bfe_u32 v9, v2, 24, 4
3067; GFX9-DL-NEXT:    v_bfe_u32 v15, v3, 20, 4
3068; GFX9-DL-NEXT:    v_bfe_u32 v16, v3, 24, 4
3069; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
3070; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
3071; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, v2, v3
3072; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v3, v9, v16
3073; GFX9-DL-NEXT:    global_load_ubyte v9, v1, s[2:3]
3074; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v0, v0, v10
3075; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
3076; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
3077; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
3078; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
3079; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
3080; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
3081; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v9
3082; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v4
3083; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v5
3084; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v6
3085; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v7
3086; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v8
3087; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v3
3088; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v2
3089; GFX9-DL-NEXT:    v_and_b32_e32 v0, 15, v0
3090; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
3091; GFX9-DL-NEXT:    s_endpgm
3092;
3093; GFX10-DL-LABEL: udot8_acc4_vecMul:
3094; GFX10-DL:       ; %bb.0: ; %entry
3095; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3096; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3097; GFX10-DL-NEXT:    s_mov_b32 s10, -1
3098; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
3099; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
3100; GFX10-DL-NEXT:    s_clause 0x1
3101; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3102; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
3103; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3104; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
3105; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3106; GFX10-DL-NEXT:    s_clause 0x1
3107; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
3108; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
3109; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
3110; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
3111; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
3112; GFX10-DL-NEXT:    v_and_b32_e32 v4, 15, v1
3113; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
3114; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v2
3115; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 4, 4
3116; GFX10-DL-NEXT:    v_bfe_u32 v7, v2, 4, 4
3117; GFX10-DL-NEXT:    v_bfe_u32 v8, v2, 8, 4
3118; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v4, v4, v5
3119; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 8, 4
3120; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v6, v6, v7
3121; GFX10-DL-NEXT:    v_bfe_u32 v7, v2, 12, 4
3122; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
3123; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
3124; GFX10-DL-NEXT:    v_bfe_u32 v4, v1, 12, 4
3125; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v5, v5, v8
3126; GFX10-DL-NEXT:    v_bfe_u32 v8, v2, 16, 4
3127; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v6
3128; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 16, 4
3129; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v4, v4, v7
3130; GFX10-DL-NEXT:    v_bfe_u32 v7, v2, 20, 4
3131; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v5
3132; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 20, 4
3133; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v6, v6, v8
3134; GFX10-DL-NEXT:    v_bfe_u32 v8, v2, 24, 4
3135; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
3136; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v4
3137; GFX10-DL-NEXT:    v_bfe_u32 v4, v1, 24, 4
3138; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v5, v5, v7
3139; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
3140; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v6
3141; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v4, v4, v8
3142; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
3143; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v5
3144; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v4
3145; GFX10-DL-NEXT:    v_add_nc_u16 v1, v2, v1
3146; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
3147; GFX10-DL-NEXT:    global_store_byte v0, v1, s[2:3]
3148; GFX10-DL-NEXT:    s_endpgm
3149                                             <8 x i4> addrspace(1)* %src2,
3150                                             i4 addrspace(1)* nocapture %dst) {
3151entry:
  ; %idx is used as the getelementptr index into both source buffers.
3152  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3153  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
3154  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
3155  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
3156  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
3157
  ; The multiply is performed directly on the i4 elements (no zext to a wider
  ; type), so each product is truncated to 4 bits.
3158  %mul = mul <8 x i4> %vec1, %vec2
3159  %mul0 = extractelement <8 x i4> %mul, i64 0
3160  %mul1 = extractelement <8 x i4> %mul, i64 1
3161  %mul2 = extractelement <8 x i4> %mul, i64 2
3162  %mul3 = extractelement <8 x i4> %mul, i64 3
3163  %mul4 = extractelement <8 x i4> %mul, i64 4
3164  %mul5 = extractelement <8 x i4> %mul, i64 5
3165  %mul6 = extractelement <8 x i4> %mul, i64 6
3166  %mul7 = extractelement <8 x i4> %mul, i64 7
3167
  ; Accumulate at i4 width: start from the value loaded from %dst, then add
  ; products 0 through 7 in element order.
3168  %acc = load i4, i4 addrspace(1)* %dst, align 4
3169  %add1 = add i4 %mul0, %acc
3170  %add2 = add i4 %add1, %mul1
3171  %add3 = add i4 %add2, %mul2
3172  %add4 = add i4 %add3, %mul3
3173  %add5 = add i4 %add4, %mul4
3174  %add6 = add i4 %add5, %mul5
3175  %add7 = add i4 %add6, %mul6
3176  %add8 = add i4 %add7, %mul7
3177
3178  store i4 %add8, i4 addrspace(1)* %dst, align 4
3179  ret void
3180}
3181
; udot8_variant1: each work-item loads one i32 from %v1addr and one from
; %v2addr, splits each into eight 4-bit fields with lshr/and, multiplies the
; fields pairwise, and adds the products to the i32 loaded from %dst.
; The add chain is not in field order (%mul1, %mul8, %mul2, ..., %mul7); the
; check blocks with the -DL suffix below still show a single v_dot8_u32_u4,
; while the other blocks show per-field multiplies and adds.
3182define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
3183; GFX7-LABEL: udot8_variant1:
3184; GFX7:       ; %bb.0: ; %entry
3185; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
3186; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3187; GFX7-NEXT:    s_mov_b32 s3, 0xf000
3188; GFX7-NEXT:    s_mov_b32 s10, 0
3189; GFX7-NEXT:    s_mov_b32 s11, s3
3190; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3191; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
3192; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3193; GFX7-NEXT:    v_mov_b32_e32 v1, 0
3194; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
3195; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
3196; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
3197; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
3198; GFX7-NEXT:    s_mov_b32 s2, -1
3199; GFX7-NEXT:    s_waitcnt vmcnt(1)
3200; GFX7-NEXT:    v_and_b32_e32 v1, 15, v2
3201; GFX7-NEXT:    v_bfe_u32 v3, v2, 4, 4
3202; GFX7-NEXT:    s_waitcnt vmcnt(0)
3203; GFX7-NEXT:    v_and_b32_e32 v9, 15, v0
3204; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 4
3205; GFX7-NEXT:    v_bfe_u32 v5, v2, 12, 4
3206; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
3207; GFX7-NEXT:    v_bfe_u32 v7, v2, 20, 4
3208; GFX7-NEXT:    v_bfe_u32 v8, v2, 24, 4
3209; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
3210; GFX7-NEXT:    v_bfe_u32 v10, v0, 4, 4
3211; GFX7-NEXT:    v_bfe_u32 v11, v0, 8, 4
3212; GFX7-NEXT:    v_bfe_u32 v12, v0, 12, 4
3213; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
3214; GFX7-NEXT:    v_bfe_u32 v14, v0, 20, 4
3215; GFX7-NEXT:    v_bfe_u32 v15, v0, 24, 4
3216; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
3217; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3218; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v1, s4
3219; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
3220; GFX7-NEXT:    v_mad_u32_u24 v0, v10, v3, v0
3221; GFX7-NEXT:    v_mad_u32_u24 v0, v11, v4, v0
3222; GFX7-NEXT:    v_mad_u32_u24 v0, v12, v5, v0
3223; GFX7-NEXT:    v_mad_u32_u24 v0, v13, v6, v0
3224; GFX7-NEXT:    v_mad_u32_u24 v0, v14, v7, v0
3225; GFX7-NEXT:    v_mad_u32_u24 v0, v15, v8, v0
3226; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3227; GFX7-NEXT:    s_endpgm
3228;
3229; GFX8-LABEL: udot8_variant1:
3230; GFX8:       ; %bb.0: ; %entry
3231; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3232; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
3233; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3234; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3235; GFX8-NEXT:    v_mov_b32_e32 v1, s5
3236; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
3237; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3238; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3239; GFX8-NEXT:    v_mov_b32_e32 v1, s7
3240; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
3241; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3242; GFX8-NEXT:    flat_load_dword v0, v[0:1]
3243; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
3244; GFX8-NEXT:    s_waitcnt vmcnt(1)
3245; GFX8-NEXT:    v_and_b32_e32 v1, 15, v3
3246; GFX8-NEXT:    v_bfe_u32 v4, v3, 4, 4
3247; GFX8-NEXT:    v_bfe_u32 v6, v3, 8, 4
3248; GFX8-NEXT:    v_bfe_u32 v8, v3, 12, 4
3249; GFX8-NEXT:    s_waitcnt vmcnt(0)
3250; GFX8-NEXT:    v_and_b32_e32 v2, 15, v0
3251; GFX8-NEXT:    v_bfe_u32 v5, v0, 4, 4
3252; GFX8-NEXT:    v_bfe_u32 v7, v0, 8, 4
3253; GFX8-NEXT:    v_bfe_u32 v9, v0, 12, 4
3254; GFX8-NEXT:    v_bfe_u32 v10, v3, 16, 4
3255; GFX8-NEXT:    v_bfe_u32 v11, v0, 16, 4
3256; GFX8-NEXT:    v_bfe_u32 v12, v3, 20, 4
3257; GFX8-NEXT:    v_bfe_u32 v14, v3, 24, 4
3258; GFX8-NEXT:    v_bfe_u32 v13, v0, 20, 4
3259; GFX8-NEXT:    v_bfe_u32 v15, v0, 24, 4
3260; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
3261; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
3262; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3263; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v1, s2
3264; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, v1
3265; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v4, v0
3266; GFX8-NEXT:    v_mad_u32_u24 v0, v7, v6, v0
3267; GFX8-NEXT:    v_mad_u32_u24 v0, v9, v8, v0
3268; GFX8-NEXT:    v_mad_u32_u24 v0, v11, v10, v0
3269; GFX8-NEXT:    v_mad_u32_u24 v0, v13, v12, v0
3270; GFX8-NEXT:    v_mad_u32_u24 v2, v15, v14, v0
3271; GFX8-NEXT:    v_mov_b32_e32 v0, s0
3272; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3273; GFX8-NEXT:    flat_store_dword v[0:1], v2
3274; GFX8-NEXT:    s_endpgm
3275;
3276; GFX9-LABEL: udot8_variant1:
3277; GFX9:       ; %bb.0: ; %entry
3278; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3279; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
3280; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3281; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3282; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
3283; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
3284; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
3285; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3286; GFX9-NEXT:    s_waitcnt vmcnt(1)
3287; GFX9-NEXT:    v_and_b32_e32 v3, 15, v1
3288; GFX9-NEXT:    s_waitcnt vmcnt(0)
3289; GFX9-NEXT:    v_and_b32_e32 v4, 15, v2
3290; GFX9-NEXT:    v_bfe_u32 v5, v1, 4, 4
3291; GFX9-NEXT:    v_bfe_u32 v6, v2, 4, 4
3292; GFX9-NEXT:    v_bfe_u32 v7, v1, 8, 4
3293; GFX9-NEXT:    v_bfe_u32 v8, v2, 8, 4
3294; GFX9-NEXT:    v_bfe_u32 v9, v1, 12, 4
3295; GFX9-NEXT:    v_bfe_u32 v10, v2, 12, 4
3296; GFX9-NEXT:    v_bfe_u32 v11, v1, 16, 4
3297; GFX9-NEXT:    v_bfe_u32 v12, v2, 16, 4
3298; GFX9-NEXT:    v_bfe_u32 v13, v1, 20, 4
3299; GFX9-NEXT:    v_bfe_u32 v15, v1, 24, 4
3300; GFX9-NEXT:    v_bfe_u32 v14, v2, 20, 4
3301; GFX9-NEXT:    v_bfe_u32 v16, v2, 24, 4
3302; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
3303; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
3304; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v4, v3
3305; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v2, v1
3306; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v6, v5
3307; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v8, v7
3308; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3309; GFX9-NEXT:    v_add3_u32 v1, v3, s0, v1
3310; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v10, v9
3311; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v12, v11
3312; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v5
3313; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v14, v13
3314; GFX9-NEXT:    v_mul_u32_u24_e32 v9, v16, v15
3315; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v7
3316; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v9
3317; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
3318; GFX9-NEXT:    s_endpgm
3319;
3320; GFX9-DL-LABEL: udot8_variant1:
3321; GFX9-DL:       ; %bb.0: ; %entry
3322; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3323; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
3324; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3325; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
3326; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3327; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
3328; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
3329; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
3330; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3331; GFX9-DL-NEXT:    v_dot8_u32_u4 v0, v3, v2, s0
3332; GFX9-DL-NEXT:    global_store_dword v1, v0, s[2:3]
3333; GFX9-DL-NEXT:    s_endpgm
3334;
3335; GFX10-DL-LABEL: udot8_variant1:
3336; GFX10-DL:       ; %bb.0: ; %entry
3337; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3338; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3339; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
3340; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3341; GFX10-DL-NEXT:    s_clause 0x1
3342; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
3343; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
3344; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
3345; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
3346; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3347; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v2, v1, s2
3348; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
3349; GFX10-DL-NEXT:    s_endpgm
3350                                          i32 addrspace(1)* %v2addr,
3351                                          i32 addrspace(1)* %dst) {
3352entry:
  ; %idx is used as the getelementptr index into both source buffers.
3353  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3354  %gep1 = getelementptr i32, i32 addrspace(1)* %v1addr, i32 %idx
3355  %v1 = load i32, i32 addrspace(1)* %gep1, align 4
3356  %gep2 = getelementptr i32, i32 addrspace(1)* %v2addr, i32 %idx
3357  %v2 = load i32, i32 addrspace(1)* %gep2, align 4
  ; Bits 3..0 of each operand -> %mul1.
3358  %and = and i32 %v1, 15
3359  %and1 = and i32 %v2, 15
3360  %mul1 = mul nuw nsw i32 %and1, %and
3361
  ; Bits 7..4 -> %mul2.
3362  %shr = lshr i32 %v1, 4
3363  %and2 = and i32 %shr, 15
3364  %shr3 = lshr i32 %v2, 4
3365  %and4 = and i32 %shr3, 15
3366  %mul2 = mul nuw nsw i32 %and4, %and2
3367
  ; Bits 11..8 -> %mul3.
3368  %shr6 = lshr i32 %v1, 8
3369  %and7 = and i32 %shr6, 15
3370  %shr8 = lshr i32 %v2, 8
3371  %and9 = and i32 %shr8, 15
3372  %mul3 = mul nuw nsw i32 %and9, %and7
3373
  ; Bits 15..12 -> %mul4.
3374  %shr12 = lshr i32 %v1, 12
3375  %and13 = and i32 %shr12, 15
3376  %shr14 = lshr i32 %v2, 12
3377  %and15 = and i32 %shr14, 15
3378  %mul4 = mul nuw nsw i32 %and15, %and13
3379
  ; Bits 19..16 -> %mul5.
3380  %shr18 = lshr i32 %v1, 16
3381  %and19 = and i32 %shr18, 15
3382  %shr20 = lshr i32 %v2, 16
3383  %and21 = and i32 %shr20, 15
3384  %mul5 = mul nuw nsw i32 %and21, %and19
3385
  ; Bits 23..20 -> %mul6.
3386  %shr24 = lshr i32 %v1, 20
3387  %and25 = and i32 %shr24, 15
3388  %shr26 = lshr i32 %v2, 20
3389  %and27 = and i32 %shr26, 15
3390  %mul6 = mul nuw nsw i32 %and27, %and25
3391
  ; Bits 27..24 -> %mul7.
3392  %shr30 = lshr i32 %v1, 24
3393  %and31 = and i32 %shr30, 15
3394  %shr32 = lshr i32 %v2, 24
3395  %and33 = and i32 %shr32, 15
3396  %mul7 = mul nuw nsw i32 %and33, %and31
3397
  ; Bits 31..28 -> %mul8; lshr by 28 leaves only four bits, so no mask is
  ; applied here.
3398  %shr36 = lshr i32 %v1, 28
3399  %shr37 = lshr i32 %v2, 28
3400  %mul8 = mul nuw nsw i32 %shr37, %shr36
3401  %acc = load i32, i32 addrspace(1)* %dst, align 4
3402
  ; Sum onto %acc with %mul8 (top field) added second, ahead of
  ; %mul2 through %mul7.
3403  %add1 = add i32 %mul1, %acc
3404  %add2 = add i32 %add1, %mul8
3405  %add3 = add i32 %add2, %mul2
3406  %add4 = add i32 %add3, %mul3
3407  %add5 = add i32 %add4, %mul4
3408  %add6 = add i32 %add5, %mul5
3409  %add7 = add i32 %add6, %mul6
3410  %add8 = add i32 %add7, %mul7
3411  store i32 %add8, i32 addrspace(1)* %dst, align 4
3412  ret void
3413}
3414
; Intrinsic whose i32 result the kernels above use as the getelementptr
; index into their source buffers.
3415declare i32 @llvm.amdgcn.workitem.id.x()
3416