; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s

; Baseline case: llvm.amdgcn.udot8 is called directly on the three i32
; arguments with its trailing i1 immediate set to false.
; The expected output on every tested target is a single v_dot8_u32_u4 that
; reads v0, v1 and v2, writes v0, and carries no clamp modifier.
define i32 @v_udot8(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 false)
  ret i32 %r
}

; Same call shape as v_udot8, but the trailing i1 immediate is true.
; The expected output differs from the baseline only by the "clamp" modifier
; appended to v_dot8_u32_u4.
define i32 @v_udot8_clamp(i32 %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8_clamp:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2 clamp
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot8_clamp:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2 clamp
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 true)
  ret i32 %r
}

; FIXME: Fix argument handling so that these casts are not expanded.
; define i32 @v_udot8_cast_v8i4(<8 x i4> %a, <8 x i4> %b, i32 %c) {
;   %a.cast = bitcast <8 x i4> %a to i32
;   %b.cast = bitcast <8 x i4> %b to i32
;   %r = call i32 @llvm.amdgcn.udot8(i32 %a.cast, i32 %b.cast, i32 %c, i1 false)
;   ret i32 %r
; }

; The first dot operand is a float that is negated with fneg and then bitcast
; to i32 before being passed to llvm.amdgcn.udot8.
; The expected output performs the negation as an explicit v_xor_b32 of v0
; with 0x80000000 ahead of the dot instruction; v_dot8_u32_u4 itself carries
; no modifier on that operand.
define i32 @v_udot8_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8_fnegf32_a:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; GFX906-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot8_fnegf32_a:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; GFX10-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.a = fneg float %a
  %cast.neg.a = bitcast float %neg.a to i32
  %r = call i32 @llvm.amdgcn.udot8(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
  ret i32 %r
}

; Vector variant of the fneg case: the first dot operand is a <2 x half> that
; is negated with fneg and then bitcast to i32.
; The expected output performs the negation as an explicit v_xor_b32 of v0
; with 0x80008000 (bit 15 of each 16-bit half) ahead of the dot instruction;
; v_dot8_u32_u4 itself carries no modifier on that operand.
define i32 @v_udot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_udot8_fnegv2f16_a:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
; GFX906-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot8_fnegv2f16_a:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
; GFX10-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.a = fneg <2 x half> %a
  %cast.neg.a = bitcast <2 x half> %neg.a to i32
  %r = call i32 @llvm.amdgcn.udot8(i32 %cast.neg.a, i32 %b, i32 %c, i1 false)
  ret i32 %r
}

; Intrinsic under test. It takes three i32 operands plus a trailing i1 that is
; marked immarg, so every call site must pass that operand as a constant.
declare i32 @llvm.amdgcn.udot8(i32, i32, i32, i1 immarg) #0

; Attribute group referenced by the intrinsic declaration.
attributes #0 = { nounwind readnone speculatable }