; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=CM %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s

; 32-bit multiply whose two operands each have their top 8 bits cleared
; (shl 8 followed by lshr 8), so only the low 24 bits of %a and %b contribute.
; The assertions below expect both targets to mask the kernel arguments with
; 16777215 and to form the product with MULLO_INT.
define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
; CM-LABEL: test_umul24_i32:
; CM:       ; %bb.0: ; %entry
; CM-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT:    CF_END
; CM-NEXT:    PAD
; CM-NEXT:    ALU clause starting at 4:
; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT:     AND_INT T0.Z, KC0[2].W, literal.y,
; CM-NEXT:     AND_INT * T0.W, KC0[2].Z, literal.y,
; CM-NEXT:    2(2.802597e-45), 16777215(2.350989e-38)
; CM-NEXT:     MULLO_INT T1.X, T0.W, T0.Z,
; CM-NEXT:     MULLO_INT T1.Y (MASKED), T0.W, T0.Z,
; CM-NEXT:     MULLO_INT T1.Z (MASKED), T0.W, T0.Z,
; CM-NEXT:     MULLO_INT * T1.W (MASKED), T0.W, T0.Z,
;
; EG-LABEL: test_umul24_i32:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     AND_INT T0.W, KC0[2].W, literal.x,
; EG-NEXT:     AND_INT * T1.W, KC0[2].Z, literal.x,
; EG-NEXT:    16777215(2.350989e-38), 0(0.000000e+00)
; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT:     MULLO_INT * T1.X, PS, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  ; Clear bits 31..24 of each operand.
  %0 = shl i32 %a, 8
  %a_24 = lshr i32 %0, 8
  %1 = shl i32 %b, 8
  %b_24 = lshr i32 %1, 8
  %2 = mul i32 %a_24, %b_24
  store i32 %2, i32 addrspace(1)* %out
  ret void
}

; i16 multiply whose product is sign-extended to i32 before the store.
; The result must be sign-extended: the assertions below expect a BFE_INT of
; width 16 applied to the MULLO_INT result on both targets.
define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
; CM-LABEL: test_umul24_i16_sext:
; CM:       ; %bb.0: ; %entry
; CM-NEXT:    ALU 0, @10, KC0[], KC1[]
; CM-NEXT:    TEX 1 @6
; CM-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT:    CF_END
; CM-NEXT:    PAD
; CM-NEXT:    Fetch clause starting at 6:
; CM-NEXT:     VTX_READ_16 T1.X, T0.X, 40, #3
; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 42, #3
; CM-NEXT:    ALU clause starting at 10:
; CM-NEXT:     MOV * T0.X, 0.0,
; CM-NEXT:    ALU clause starting at 11:
; CM-NEXT:     MULLO_INT T0.X, T1.X, T0.X,
; CM-NEXT:     MULLO_INT T0.Y (MASKED), T1.X, T0.X,
; CM-NEXT:     MULLO_INT T0.Z (MASKED), T1.X, T0.X,
; CM-NEXT:     MULLO_INT * T0.W (MASKED), T1.X, T0.X,
; CM-NEXT:     BFE_INT * T0.X, PV.X, 0.0, literal.x,
; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; EG-LABEL: test_umul24_i16_sext:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 40, #3
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 42, #3
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, 0.0,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     MULLO_INT * T0.X, T1.X, T0.X,
; EG-NEXT:     BFE_INT T0.X, PS, 0.0, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
entry:
  %mul = mul i16 %a, %b
  ; Widen the 16-bit product with sign extension.
  %ext = sext i16 %mul to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; i8 multiply whose product is sign-extended to i32 before the store.
; The result must be sign-extended: the assertions below expect a BFE_INT of
; width 8 applied to the MULLO_INT result on both targets.
define amdgpu_kernel void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) {
; CM-LABEL: test_umul24_i8:
; CM:       ; %bb.0: ; %entry
; CM-NEXT:    ALU 0, @10, KC0[], KC1[]
; CM-NEXT:    TEX 1 @6
; CM-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT:    CF_END
; CM-NEXT:    PAD
; CM-NEXT:    Fetch clause starting at 6:
; CM-NEXT:     VTX_READ_8 T1.X, T0.X, 40, #3
; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 41, #3
; CM-NEXT:    ALU clause starting at 10:
; CM-NEXT:     MOV * T0.X, 0.0,
; CM-NEXT:    ALU clause starting at 11:
; CM-NEXT:     MULLO_INT T0.X, T1.X, T0.X,
; CM-NEXT:     MULLO_INT T0.Y (MASKED), T1.X, T0.X,
; CM-NEXT:     MULLO_INT T0.Z (MASKED), T1.X, T0.X,
; CM-NEXT:     MULLO_INT * T0.W (MASKED), T1.X, T0.X,
; CM-NEXT:     BFE_INT * T0.X, PV.X, 0.0, literal.x,
; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; EG-LABEL: test_umul24_i8:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 40, #3
; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 41, #3
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, 0.0,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     MULLO_INT * T0.X, T1.X, T0.X,
; EG-NEXT:     BFE_INT T0.X, PS, 0.0, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
entry:
  %mul = mul i8 %a, %b
  ; Widen the 8-bit product with sign extension.
  %ext = sext i8 %mul to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; High 32 bits of a 64-bit product of two i32 values that are first masked to
; 24 bits (and with 16777215) and then zero-extended to i64.
; The assertions below expect a single MULHI_UINT24 reading the kernel
; arguments directly, with no separate AND_INT, on both targets.
define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
; CM-LABEL: test_umulhi24_i32_i64:
; CM:       ; %bb.0: ; %entry
; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT:    CF_END
; CM-NEXT:    PAD
; CM-NEXT:    ALU clause starting at 4:
; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT:     MULHI_UINT24 T1.X, KC0[2].Z, KC0[2].W,
; CM-NEXT:     MULHI_UINT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT:     MULHI_UINT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT:     MULHI_UINT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
;
; EG-LABEL: test_umulhi24_i32_i64:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     MULHI_UINT24 * T1.X, KC0[2].Z, KC0[2].W,
entry:
  ; Keep only the low 24 bits of each operand.
  %a.24 = and i32 %a, 16777215
  %b.24 = and i32 %b, 16777215
  %a.24.i64 = zext i32 %a.24 to i64
  %b.24.i64 = zext i32 %b.24 to i64
  %mul48 = mul i64 %a.24.i64, %b.24.i64
  ; Extract bits 63..32 of the product.
  %mul48.hi = lshr i64 %mul48, 32
  %mul24hi = trunc i64 %mul48.hi to i32
  store i32 %mul24hi, i32 addrspace(1)* %out
  ret void
}

; High 32 bits of a 64-bit product of two i64 kernel arguments that are masked
; to 24 bits (and with 16777215) directly in i64.
; The assertions below expect a single MULHI_UINT24 on the low dwords of the
; two arguments on both targets.
define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
; CM-LABEL: test_umulhi24:
; CM:       ; %bb.0: ; %entry
; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT:    CF_END
; CM-NEXT:    PAD
; CM-NEXT:    ALU clause starting at 4:
; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT:     MULHI_UINT24 T1.X, KC0[2].W, KC0[3].Y,
; CM-NEXT:     MULHI_UINT24 T1.Y (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT:     MULHI_UINT24 T1.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT:     MULHI_UINT24 * T1.W (MASKED), KC0[2].W, KC0[3].Y,
;
; EG-LABEL: test_umulhi24:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     MULHI_UINT24 * T1.X, KC0[2].W, KC0[3].Y,
entry:
  ; Keep only the low 24 bits of each 64-bit operand.
  %a.24 = and i64 %a, 16777215
  %b.24 = and i64 %b, 16777215
  %mul48 = mul i64 %a.24, %b.24
  ; Extract bits 63..32 of the product.
  %mul48.hi = lshr i64 %mul48, 32
  %mul24.hi = trunc i64 %mul48.hi to i32
  store i32 %mul24.hi, i32 addrspace(1)* %out
  ret void
}

; Multiply with 24-bit inputs and 64-bit output.
; Both i64 operands have their top 40 bits cleared (shl 40 / lshr 40) and the
; full i64 product is stored. The assertions below expect the low dword from
; AND_INT + MULLO_INT and the high dword from MULHI_UINT24 on both targets.
define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; CM-LABEL: test_umul24_i64:
; CM:       ; %bb.0: ; %entry
; CM-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT:    CF_END
; CM-NEXT:    PAD
; CM-NEXT:    ALU clause starting at 4:
; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT:     AND_INT * T0.Z, KC0[3].Y, literal.y,
; CM-NEXT:    2(2.802597e-45), 16777215(2.350989e-38)
; CM-NEXT:     AND_INT * T0.W, KC0[2].W, literal.x,
; CM-NEXT:    16777215(2.350989e-38), 0(0.000000e+00)
; CM-NEXT:     MULLO_INT T1.X, T0.W, T0.Z,
; CM-NEXT:     MULLO_INT T1.Y (MASKED), T0.W, T0.Z,
; CM-NEXT:     MULLO_INT T1.Z (MASKED), T0.W, T0.Z,
; CM-NEXT:     MULLO_INT * T1.W (MASKED), T0.W, T0.Z,
; CM-NEXT:     MULHI_UINT24 T1.X (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT:     MULHI_UINT24 T1.Y, KC0[2].W, KC0[3].Y,
; CM-NEXT:     MULHI_UINT24 T1.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT:     MULHI_UINT24 * T1.W (MASKED), KC0[2].W, KC0[3].Y,
;
; EG-LABEL: test_umul24_i64:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     AND_INT T0.W, KC0[3].Y, literal.x,
; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.x,
; EG-NEXT:    16777215(2.350989e-38), 0(0.000000e+00)
; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT:     MULLO_INT * T1.X, PS, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     MULHI_UINT24 * T1.Y, KC0[2].W, KC0[3].Y,
entry:
  ; Clear bits 63..24 of each operand.
  %tmp0 = shl i64 %a, 40
  %a_24 = lshr i64 %tmp0, 40
  %tmp1 = shl i64 %b, 40
  %b_24 = lshr i64 %tmp1, 40
  %tmp2 = mul i64 %a_24, %b_24
  store i64 %tmp2, i64 addrspace(1)* %out
  ret void
}
