1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
3; RUN: llc -mtriple=r600-- -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM %s
4
5; Loosely based on test/CodeGen/{X86,AArch64}/extract-lowbits.ll,
6; but with all 64-bit tests and all tests involving loads dropped.
7
8; Patterns:
9;   a) x &  (1 << nbits) - 1
10;   b) x & ~(-1 << nbits)
11;   c) x &  (-1 >> (32 - nbits))
12;   d) x << (32 - nbits) >> (32 - nbits)
13; are equivalent. Every '>>' above is a logical (lshr) shift.
14
15; ---------------------------------------------------------------------------- ;
16; Pattern a. 32-bit
17; ---------------------------------------------------------------------------- ;
18
; Pattern a, base case: stores %val & ((1 << %numlowbits) - 1) to %out.
; For both cypress and cayman the assertions expect the mask arithmetic to
; collapse into one BFE_UINT with a 0.0 middle operand; the only other ALU
; instruction is an LSHR of KC0[2].W by 2 (presumably forming the store
; address from %out - TODO confirm).
19define amdgpu_kernel void @bzhi32_a0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
20; EG-LABEL: bzhi32_a0:
21; EG:       ; %bb.0:
22; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
23; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
24; EG-NEXT:    CF_END
25; EG-NEXT:    PAD
26; EG-NEXT:    ALU clause starting at 4:
27; EG-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
28; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
29; EG-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
30;
31; CM-LABEL: bzhi32_a0:
32; CM:       ; %bb.0:
33; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
34; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
35; CM-NEXT:    CF_END
36; CM-NEXT:    PAD
37; CM-NEXT:    ALU clause starting at 4:
38; CM-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
39; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
40; CM-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
41  %onebit = shl i32 1, %numlowbits ; 1 << numlowbits
42  %mask = add nsw i32 %onebit, -1 ; (1 << numlowbits) - 1
43  %masked = and i32 %mask, %val ; mask is the first operand of the and
44  store i32 %masked, i32 addrspace(1)* %out
45  ret void
46}
47
; Pattern a with a narrow bit count: %numlowbits is an i8 argument marked
; zeroext and is widened to i32 with zext before building the mask.
; The assertions for both targets show the argument being fetched with
; VTX_READ_8, passed through BFE_INT with literal 8, and the result (PV.W)
; used as the last operand of a BFE_UINT.
; NOTE(review): BFE_INT is emitted although the IR widening is a zext; this is
; the recorded output, not something the IR below requires - confirm intent.
48define amdgpu_kernel void @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits, i32 addrspace(1)* %out) {
49; EG-LABEL: bzhi32_a1_indexzext:
50; EG:       ; %bb.0:
51; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
52; EG-NEXT:    TEX 0 @6
53; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
54; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
55; EG-NEXT:    CF_END
56; EG-NEXT:    PAD
57; EG-NEXT:    Fetch clause starting at 6:
58; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
59; EG-NEXT:    ALU clause starting at 8:
60; EG-NEXT:     MOV * T0.X, 0.0,
61; EG-NEXT:    ALU clause starting at 9:
62; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
63; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
64; EG-NEXT:     BFE_UINT T0.X, KC0[2].Y, 0.0, PV.W,
65; EG-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
66; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
67;
68; CM-LABEL: bzhi32_a1_indexzext:
69; CM:       ; %bb.0:
70; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
71; CM-NEXT:    TEX 0 @6
72; CM-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
73; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
74; CM-NEXT:    CF_END
75; CM-NEXT:    PAD
76; CM-NEXT:    Fetch clause starting at 6:
77; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
78; CM-NEXT:    ALU clause starting at 8:
79; CM-NEXT:     MOV * T0.X, 0.0,
80; CM-NEXT:    ALU clause starting at 9:
81; CM-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
82; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
83; CM-NEXT:     BFE_UINT * T0.X, KC0[2].Y, 0.0, PV.W,
84; CM-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
85; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
86  %conv = zext i8 %numlowbits to i32 ; widen the i8 bit count to i32
87  %onebit = shl i32 1, %conv ; 1 << numlowbits
88  %mask = add nsw i32 %onebit, -1 ; (1 << numlowbits) - 1
89  %masked = and i32 %mask, %val
90  store i32 %masked, i32 addrspace(1)* %out
91  ret void
92}
93
; Pattern a with the operands of the final 'and' swapped relative to
; @bzhi32_a0 (%val first, %mask second). The assertions are the same
; LSHR + BFE_UINT sequence as for @bzhi32_a0 on both targets, i.e. the
; expected output does not depend on the operand order of the 'and'.
94define amdgpu_kernel void @bzhi32_a4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
95; EG-LABEL: bzhi32_a4_commutative:
96; EG:       ; %bb.0:
97; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
98; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
99; EG-NEXT:    CF_END
100; EG-NEXT:    PAD
101; EG-NEXT:    ALU clause starting at 4:
102; EG-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
103; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
104; EG-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
105;
106; CM-LABEL: bzhi32_a4_commutative:
107; CM:       ; %bb.0:
108; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
109; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
110; CM-NEXT:    CF_END
111; CM-NEXT:    PAD
112; CM-NEXT:    ALU clause starting at 4:
113; CM-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
114; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
115; CM-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
116  %onebit = shl i32 1, %numlowbits ; 1 << numlowbits
117  %mask = add nsw i32 %onebit, -1 ; (1 << numlowbits) - 1
118  %masked = and i32 %val, %mask ; swapped order
119  store i32 %masked, i32 addrspace(1)* %out
120  ret void
121}
122
123; ---------------------------------------------------------------------------- ;
124; Pattern b. 32-bit
125; ---------------------------------------------------------------------------- ;
126
; Pattern b, base case: stores %val & ~(-1 << %numlowbits) to %out. The mask
; is built by shifting all-ones left and inverting it with 'xor ..., -1'.
; The assertions for both targets expect a single BFE_UINT with a 0.0 middle
; operand plus an LSHR of KC0[2].W by 2, the same sequence recorded for
; @bzhi32_a0.
127define amdgpu_kernel void @bzhi32_b0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
128; EG-LABEL: bzhi32_b0:
129; EG:       ; %bb.0:
130; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
131; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
132; EG-NEXT:    CF_END
133; EG-NEXT:    PAD
134; EG-NEXT:    ALU clause starting at 4:
135; EG-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
136; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
137; EG-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
138;
139; CM-LABEL: bzhi32_b0:
140; CM:       ; %bb.0:
141; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
142; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
143; CM-NEXT:    CF_END
144; CM-NEXT:    PAD
145; CM-NEXT:    ALU clause starting at 4:
146; CM-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
147; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
148; CM-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
149  %notmask = shl i32 -1, %numlowbits ; -1 << numlowbits
150  %mask = xor i32 %notmask, -1 ; bitwise NOT of %notmask
151  %masked = and i32 %mask, %val ; mask is the first operand of the and
152  store i32 %masked, i32 addrspace(1)* %out
153  ret void
154}
155
; Pattern b with a narrow bit count: %numlowbits is an i8 argument marked
; zeroext and is widened to i32 with zext before the all-ones shift.
; The assertions for both targets show the argument being fetched with
; VTX_READ_8, passed through BFE_INT with literal 8, and the result (PV.W)
; used as the last operand of a BFE_UINT - the same sequence recorded for
; @bzhi32_a1_indexzext.
156define amdgpu_kernel void @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits, i32 addrspace(1)* %out) {
157; EG-LABEL: bzhi32_b1_indexzext:
158; EG:       ; %bb.0:
159; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
160; EG-NEXT:    TEX 0 @6
161; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
162; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
163; EG-NEXT:    CF_END
164; EG-NEXT:    PAD
165; EG-NEXT:    Fetch clause starting at 6:
166; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
167; EG-NEXT:    ALU clause starting at 8:
168; EG-NEXT:     MOV * T0.X, 0.0,
169; EG-NEXT:    ALU clause starting at 9:
170; EG-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
171; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
172; EG-NEXT:     BFE_UINT T0.X, KC0[2].Y, 0.0, PV.W,
173; EG-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
174; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
175;
176; CM-LABEL: bzhi32_b1_indexzext:
177; CM:       ; %bb.0:
178; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
179; CM-NEXT:    TEX 0 @6
180; CM-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
181; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
182; CM-NEXT:    CF_END
183; CM-NEXT:    PAD
184; CM-NEXT:    Fetch clause starting at 6:
185; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
186; CM-NEXT:    ALU clause starting at 8:
187; CM-NEXT:     MOV * T0.X, 0.0,
188; CM-NEXT:    ALU clause starting at 9:
189; CM-NEXT:     BFE_INT * T0.W, T0.X, 0.0, literal.x,
190; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
191; CM-NEXT:     BFE_UINT * T0.X, KC0[2].Y, 0.0, PV.W,
192; CM-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
193; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
194  %conv = zext i8 %numlowbits to i32 ; widen the i8 bit count to i32
195  %notmask = shl i32 -1, %conv ; -1 << numlowbits
196  %mask = xor i32 %notmask, -1 ; bitwise NOT of %notmask
197  %masked = and i32 %mask, %val
198  store i32 %masked, i32 addrspace(1)* %out
199  ret void
200}
201
; Pattern b with the operands of the final 'and' swapped relative to
; @bzhi32_b0 (%val first, %mask second). The assertions are the same
; LSHR + BFE_UINT sequence as for @bzhi32_b0 on both targets, i.e. the
; expected output does not depend on the operand order of the 'and'.
202define amdgpu_kernel void @bzhi32_b4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
203; EG-LABEL: bzhi32_b4_commutative:
204; EG:       ; %bb.0:
205; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
206; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
207; EG-NEXT:    CF_END
208; EG-NEXT:    PAD
209; EG-NEXT:    ALU clause starting at 4:
210; EG-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
211; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
212; EG-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
213;
214; CM-LABEL: bzhi32_b4_commutative:
215; CM:       ; %bb.0:
216; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
217; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
218; CM-NEXT:    CF_END
219; CM-NEXT:    PAD
220; CM-NEXT:    ALU clause starting at 4:
221; CM-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
222; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
223; CM-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
224  %notmask = shl i32 -1, %numlowbits ; -1 << numlowbits
225  %mask = xor i32 %notmask, -1 ; bitwise NOT of %notmask
226  %masked = and i32 %val, %mask ; swapped order
227  store i32 %masked, i32 addrspace(1)* %out
228  ret void
229}
230
231; ---------------------------------------------------------------------------- ;
232; Pattern c. 32-bit
233; ---------------------------------------------------------------------------- ;
234
; Pattern c, base case: stores %val & (-1 >> (32 - %numlowbits)) to %out,
; where the right shift is a logical shift (lshr) of all-ones.
; The assertions for both targets expect a single BFE_UINT with a 0.0 middle
; operand plus an LSHR of KC0[2].W by 2; no subtract or separate mask shift
; appears in the expected output.
235define amdgpu_kernel void @bzhi32_c0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
236; EG-LABEL: bzhi32_c0:
237; EG:       ; %bb.0:
238; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
239; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
240; EG-NEXT:    CF_END
241; EG-NEXT:    PAD
242; EG-NEXT:    ALU clause starting at 4:
243; EG-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
244; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
245; EG-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
246;
247; CM-LABEL: bzhi32_c0:
248; CM:       ; %bb.0:
249; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
250; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
251; CM-NEXT:    CF_END
252; CM-NEXT:    PAD
253; CM-NEXT:    ALU clause starting at 4:
254; CM-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
255; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
256; CM-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
257  %numhighbits = sub i32 32, %numlowbits ; 32 - numlowbits
258  %mask = lshr i32 -1, %numhighbits ; all-ones shifted right logically
259  %masked = and i32 %mask, %val ; mask is the first operand of the and
260  store i32 %masked, i32 addrspace(1)* %out
261  ret void
262}
263
; Pattern c with a narrow bit count: %numlowbits is a plain i8 argument (no
; zeroext attribute here, unlike the a1/b1 variants). The subtraction
; 32 - %numlowbits is done in i8 and only its result is zero-extended to i32.
; The assertions for both targets contain no BFE_UINT: the expected output is
; SUB_INT from literal 32, AND_INT with literal 255, LSHR of literal -1, and a
; final AND_INT with KC0[2].Y, so this form is recorded as not being folded
; into a bitfield extract.
264define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, i32 addrspace(1)* %out) {
265; EG-LABEL: bzhi32_c1_indexzext:
266; EG:       ; %bb.0:
267; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
268; EG-NEXT:    TEX 0 @6
269; EG-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
270; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
271; EG-NEXT:    CF_END
272; EG-NEXT:    PAD
273; EG-NEXT:    Fetch clause starting at 6:
274; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
275; EG-NEXT:    ALU clause starting at 8:
276; EG-NEXT:     MOV * T0.X, 0.0,
277; EG-NEXT:    ALU clause starting at 9:
278; EG-NEXT:     SUB_INT * T0.W, literal.x, T0.X,
279; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
280; EG-NEXT:     AND_INT * T0.W, PV.W, literal.x,
281; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
282; EG-NEXT:     LSHR * T0.W, literal.x, PV.W,
283; EG-NEXT:    -1(nan), 0(0.000000e+00)
284; EG-NEXT:     AND_INT T0.X, PV.W, KC0[2].Y,
285; EG-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
286; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
287;
288; CM-LABEL: bzhi32_c1_indexzext:
289; CM:       ; %bb.0:
290; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
291; CM-NEXT:    TEX 0 @6
292; CM-NEXT:    ALU 8, @9, KC0[CB0:0-32], KC1[]
293; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
294; CM-NEXT:    CF_END
295; CM-NEXT:    PAD
296; CM-NEXT:    Fetch clause starting at 6:
297; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
298; CM-NEXT:    ALU clause starting at 8:
299; CM-NEXT:     MOV * T0.X, 0.0,
300; CM-NEXT:    ALU clause starting at 9:
301; CM-NEXT:     SUB_INT * T0.W, literal.x, T0.X,
302; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
303; CM-NEXT:     AND_INT * T0.W, PV.W, literal.x,
304; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
305; CM-NEXT:     LSHR * T0.W, literal.x, PV.W,
306; CM-NEXT:    -1(nan), 0(0.000000e+00)
307; CM-NEXT:     AND_INT * T0.X, PV.W, KC0[2].Y,
308; CM-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
309; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
310  %numhighbits = sub i8 32, %numlowbits ; 32 - numlowbits, computed in i8
311  %sh_prom = zext i8 %numhighbits to i32 ; widen the shift amount to i32
312  %mask = lshr i32 -1, %sh_prom ; all-ones shifted right logically
313  %masked = and i32 %mask, %val
314  store i32 %masked, i32 addrspace(1)* %out
315  ret void
316}
317
; Pattern c with the operands of the final 'and' swapped relative to
; @bzhi32_c0 (%val first, %mask second). The assertions are the same
; LSHR + BFE_UINT sequence as for @bzhi32_c0 on both targets, i.e. the
; expected output does not depend on the operand order of the 'and'.
318define amdgpu_kernel void @bzhi32_c4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
319; EG-LABEL: bzhi32_c4_commutative:
320; EG:       ; %bb.0:
321; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
322; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
323; EG-NEXT:    CF_END
324; EG-NEXT:    PAD
325; EG-NEXT:    ALU clause starting at 4:
326; EG-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
327; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
328; EG-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
329;
330; CM-LABEL: bzhi32_c4_commutative:
331; CM:       ; %bb.0:
332; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
333; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
334; CM-NEXT:    CF_END
335; CM-NEXT:    PAD
336; CM-NEXT:    ALU clause starting at 4:
337; CM-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
338; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
339; CM-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
340  %numhighbits = sub i32 32, %numlowbits ; 32 - numlowbits
341  %mask = lshr i32 -1, %numhighbits ; all-ones shifted right logically
342  %masked = and i32 %val, %mask ; swapped order
343  store i32 %masked, i32 addrspace(1)* %out
344  ret void
345}
346
347; ---------------------------------------------------------------------------- ;
348; Pattern d. 32-bit
349; ---------------------------------------------------------------------------- ;
350
; Pattern d, base case: stores (%val << (32 - %numlowbits)) >> (32 - %numlowbits)
; to %out, with a logical right shift (lshr). No explicit mask or 'and' is
; used; the shift pair is what clears the bits.
; The assertions for both targets expect a single BFE_UINT with a 0.0 middle
; operand plus an LSHR of KC0[2].W by 2; neither shift of %val nor the
; subtract appears in the expected output.
351define amdgpu_kernel void @bzhi32_d0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
352; EG-LABEL: bzhi32_d0:
353; EG:       ; %bb.0:
354; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
355; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
356; EG-NEXT:    CF_END
357; EG-NEXT:    PAD
358; EG-NEXT:    ALU clause starting at 4:
359; EG-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
360; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
361; EG-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
362;
363; CM-LABEL: bzhi32_d0:
364; CM:       ; %bb.0:
365; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
366; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
367; CM-NEXT:    CF_END
368; CM-NEXT:    PAD
369; CM-NEXT:    ALU clause starting at 4:
370; CM-NEXT:     LSHR * T0.X, KC0[2].W, literal.x,
371; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
372; CM-NEXT:     BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
373  %numhighbits = sub i32 32, %numlowbits ; 32 - numlowbits
374  %highbitscleared = shl i32 %val, %numhighbits ; shift the unwanted high bits out
375  %masked = lshr i32 %highbitscleared, %numhighbits ; logical shift back by the same amount
376  store i32 %masked, i32 addrspace(1)* %out
377  ret void
378}
379
; Pattern d with a narrow bit count: %numlowbits is a plain i8 argument (no
; zeroext attribute). The subtraction 32 - %numlowbits is done in i8 and only
; its result is zero-extended to i32 and used as the amount for both shifts.
; The assertions for both targets contain no BFE_UINT: the expected output is
; SUB_INT from literal 32, AND_INT with literal 255, then an LSHL / LSHR pair
; using that amount, so this form is recorded as not being folded into a
; bitfield extract.
380define amdgpu_kernel void @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits, i32 addrspace(1)* %out) {
381; EG-LABEL: bzhi32_d1_indexzext:
382; EG:       ; %bb.0:
383; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
384; EG-NEXT:    TEX 0 @6
385; EG-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
386; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
387; EG-NEXT:    CF_END
388; EG-NEXT:    PAD
389; EG-NEXT:    Fetch clause starting at 6:
390; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
391; EG-NEXT:    ALU clause starting at 8:
392; EG-NEXT:     MOV * T0.X, 0.0,
393; EG-NEXT:    ALU clause starting at 9:
394; EG-NEXT:     SUB_INT * T0.W, literal.x, T0.X,
395; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
396; EG-NEXT:     AND_INT * T0.W, PV.W, literal.x,
397; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
398; EG-NEXT:     LSHL * T1.W, KC0[2].Y, PV.W,
399; EG-NEXT:     LSHR T0.X, PV.W, T0.W,
400; EG-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
401; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
402;
403; CM-LABEL: bzhi32_d1_indexzext:
404; CM:       ; %bb.0:
405; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
406; CM-NEXT:    TEX 0 @6
407; CM-NEXT:    ALU 7, @9, KC0[CB0:0-32], KC1[]
408; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
409; CM-NEXT:    CF_END
410; CM-NEXT:    PAD
411; CM-NEXT:    Fetch clause starting at 6:
412; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
413; CM-NEXT:    ALU clause starting at 8:
414; CM-NEXT:     MOV * T0.X, 0.0,
415; CM-NEXT:    ALU clause starting at 9:
416; CM-NEXT:     SUB_INT * T0.W, literal.x, T0.X,
417; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
418; CM-NEXT:     AND_INT * T0.W, PV.W, literal.x,
419; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
420; CM-NEXT:     LSHL * T1.W, KC0[2].Y, PV.W,
421; CM-NEXT:     LSHR * T0.X, PV.W, T0.W,
422; CM-NEXT:     LSHR * T1.X, KC0[2].W, literal.x,
423; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
424  %numhighbits = sub i8 32, %numlowbits ; 32 - numlowbits, computed in i8
425  %sh_prom = zext i8 %numhighbits to i32 ; widen the shift amount to i32
426  %highbitscleared = shl i32 %val, %sh_prom ; shift the unwanted high bits out
427  %masked = lshr i32 %highbitscleared, %sh_prom ; logical shift back by the same amount
428  store i32 %masked, i32 addrspace(1)* %out
429  ret void
430}
431