1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s 3; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s 4; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s 6 7define amdgpu_ps void @static_exact(float %arg0, float %arg1) { 8; SI-LABEL: static_exact: 9; SI: ; %bb.0: ; %.entry 10; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 11; SI-NEXT: s_andn2_b64 exec, exec, exec 12; SI-NEXT: s_cbranch_scc0 BB0_2 13; SI-NEXT: ; %bb.1: ; %.entry 14; SI-NEXT: s_mov_b64 exec, 0 15; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 16; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm 17; SI-NEXT: s_endpgm 18; SI-NEXT: BB0_2: 19; SI-NEXT: s_mov_b64 exec, 0 20; SI-NEXT: exp null off, off, off, off done vm 21; SI-NEXT: s_endpgm 22; 23; GFX9-LABEL: static_exact: 24; GFX9: ; %bb.0: ; %.entry 25; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 26; GFX9-NEXT: s_andn2_b64 exec, exec, exec 27; GFX9-NEXT: s_cbranch_scc0 BB0_2 28; GFX9-NEXT: ; %bb.1: ; %.entry 29; GFX9-NEXT: s_mov_b64 exec, 0 30; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 31; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm 32; GFX9-NEXT: s_endpgm 33; GFX9-NEXT: BB0_2: 34; GFX9-NEXT: s_mov_b64 exec, 0 35; GFX9-NEXT: exp null off, off, off, off done vm 36; GFX9-NEXT: s_endpgm 37; 38; GFX10-32-LABEL: static_exact: 39; GFX10-32: ; %bb.0: ; %.entry 40; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 41; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, exec_lo 42; GFX10-32-NEXT: s_cbranch_scc0 BB0_2 43; GFX10-32-NEXT: ; %bb.1: ; %.entry 44; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 45; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo 46; GFX10-32-NEXT: exp mrt1 v0, v0, 
v0, v0 done vm 47; GFX10-32-NEXT: s_endpgm 48; GFX10-32-NEXT: BB0_2: 49; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 50; GFX10-32-NEXT: exp null off, off, off, off done vm 51; GFX10-32-NEXT: s_endpgm 52; 53; GFX10-64-LABEL: static_exact: 54; GFX10-64: ; %bb.0: ; %.entry 55; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 56; GFX10-64-NEXT: s_andn2_b64 exec, exec, exec 57; GFX10-64-NEXT: s_cbranch_scc0 BB0_2 58; GFX10-64-NEXT: ; %bb.1: ; %.entry 59; GFX10-64-NEXT: s_mov_b64 exec, 0 60; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 61; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm 62; GFX10-64-NEXT: s_endpgm 63; GFX10-64-NEXT: BB0_2: 64; GFX10-64-NEXT: s_mov_b64 exec, 0 65; GFX10-64-NEXT: exp null off, off, off, off done vm 66; GFX10-64-NEXT: s_endpgm 67.entry: 68 %c0 = fcmp olt float %arg0, 0.000000e+00 69 %c1 = fcmp oge float %arg1, 0.0 70 call void @llvm.amdgcn.wqm.demote(i1 false) 71 %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 72 call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 73 ret void 74} 75 76define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { 77; SI-LABEL: dynamic_exact: 78; SI: ; %bb.0: ; %.entry 79; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 80; SI-NEXT: s_mov_b64 s[2:3], exec 81; SI-NEXT: s_xor_b64 s[0:1], s[0:1], exec 82; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] 83; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 84; SI-NEXT: s_cbranch_scc0 BB1_2 85; SI-NEXT: ; %bb.1: ; %.entry 86; SI-NEXT: s_and_b64 exec, exec, s[2:3] 87; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 88; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm 89; SI-NEXT: s_endpgm 90; SI-NEXT: BB1_2: 91; SI-NEXT: s_mov_b64 exec, 0 92; SI-NEXT: exp null off, off, off, off done vm 93; SI-NEXT: s_endpgm 94; 95; GFX9-LABEL: dynamic_exact: 96; GFX9: ; %bb.0: ; %.entry 97; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 98; GFX9-NEXT: s_mov_b64 s[2:3], exec 99; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], exec 100; GFX9-NEXT: s_andn2_b64 
s[2:3], s[2:3], s[0:1] 101; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 102; GFX9-NEXT: s_cbranch_scc0 BB1_2 103; GFX9-NEXT: ; %bb.1: ; %.entry 104; GFX9-NEXT: s_and_b64 exec, exec, s[2:3] 105; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 106; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm 107; GFX9-NEXT: s_endpgm 108; GFX9-NEXT: BB1_2: 109; GFX9-NEXT: s_mov_b64 exec, 0 110; GFX9-NEXT: exp null off, off, off, off done vm 111; GFX9-NEXT: s_endpgm 112; 113; GFX10-32-LABEL: dynamic_exact: 114; GFX10-32: ; %bb.0: ; %.entry 115; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1 116; GFX10-32-NEXT: s_mov_b32 s1, exec_lo 117; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 118; GFX10-32-NEXT: s_xor_b32 s0, s0, exec_lo 119; GFX10-32-NEXT: s_andn2_b32 s1, s1, s0 120; GFX10-32-NEXT: s_cbranch_scc0 BB1_2 121; GFX10-32-NEXT: ; %bb.1: ; %.entry 122; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s1 123; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo 124; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm 125; GFX10-32-NEXT: s_endpgm 126; GFX10-32-NEXT: BB1_2: 127; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 128; GFX10-32-NEXT: exp null off, off, off, off done vm 129; GFX10-32-NEXT: s_endpgm 130; 131; GFX10-64-LABEL: dynamic_exact: 132; GFX10-64: ; %bb.0: ; %.entry 133; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 134; GFX10-64-NEXT: s_mov_b64 s[2:3], exec 135; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 136; GFX10-64-NEXT: s_xor_b64 s[0:1], s[0:1], exec 137; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] 138; GFX10-64-NEXT: s_cbranch_scc0 BB1_2 139; GFX10-64-NEXT: ; %bb.1: ; %.entry 140; GFX10-64-NEXT: s_and_b64 exec, exec, s[2:3] 141; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 142; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm 143; GFX10-64-NEXT: s_endpgm 144; GFX10-64-NEXT: BB1_2: 145; GFX10-64-NEXT: s_mov_b64 exec, 0 146; GFX10-64-NEXT: exp null off, off, off, off done vm 147; GFX10-64-NEXT: s_endpgm 148.entry: 149 %c0 = fcmp olt float %arg0, 0.000000e+00 150 %c1 = fcmp oge float %arg1, 0.0 151 
call void @llvm.amdgcn.wqm.demote(i1 %c1) 152 %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 153 call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 154 ret void 155} 156 157define amdgpu_ps void @branch(float %arg0, float %arg1) { 158; SI-LABEL: branch: 159; SI: ; %bb.0: ; %.entry 160; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 161; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 162; SI-NEXT: s_mov_b64 s[2:3], exec 163; SI-NEXT: v_or_b32_e32 v0, v0, v1 164; SI-NEXT: v_and_b32_e32 v1, 1, v0 165; SI-NEXT: v_and_b32_e32 v0, 1, v0 166; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 167; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 168; SI-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] 169; SI-NEXT: s_xor_b64 s[0:1], exec, s[4:5] 170; SI-NEXT: s_cbranch_execz BB2_3 171; SI-NEXT: ; %bb.1: ; %.demote 172; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec 173; SI-NEXT: s_cbranch_scc0 BB2_4 174; SI-NEXT: ; %bb.2: ; %.demote 175; SI-NEXT: s_mov_b64 exec, 0 176; SI-NEXT: BB2_3: ; %.continue 177; SI-NEXT: s_or_b64 exec, exec, s[0:1] 178; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 179; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm 180; SI-NEXT: s_endpgm 181; SI-NEXT: BB2_4: 182; SI-NEXT: s_mov_b64 exec, 0 183; SI-NEXT: exp null off, off, off, off done vm 184; SI-NEXT: s_endpgm 185; 186; GFX9-LABEL: branch: 187; GFX9: ; %bb.0: ; %.entry 188; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 189; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 190; GFX9-NEXT: s_mov_b64 s[2:3], exec 191; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 192; GFX9-NEXT: v_and_b32_e32 v1, 1, v0 193; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 194; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 195; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 196; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] 197; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[4:5] 198; GFX9-NEXT: s_cbranch_execz BB2_3 199; GFX9-NEXT: ; %bb.1: ; %.demote 200; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], exec 201; GFX9-NEXT: s_cbranch_scc0 BB2_4 202; GFX9-NEXT: ; %bb.2: ; %.demote 
203; GFX9-NEXT: s_mov_b64 exec, 0 204; GFX9-NEXT: BB2_3: ; %.continue 205; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 206; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 207; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm 208; GFX9-NEXT: s_endpgm 209; GFX9-NEXT: BB2_4: 210; GFX9-NEXT: s_mov_b64 exec, 0 211; GFX9-NEXT: exp null off, off, off, off done vm 212; GFX9-NEXT: s_endpgm 213; 214; GFX10-32-LABEL: branch: 215; GFX10-32: ; %bb.0: ; %.entry 216; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 217; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 218; GFX10-32-NEXT: s_mov_b32 s1, exec_lo 219; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 220; GFX10-32-NEXT: v_and_b32_e32 v1, 1, v0 221; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 222; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 223; GFX10-32-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 224; GFX10-32-NEXT: s_and_saveexec_b32 s2, s0 225; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s2 226; GFX10-32-NEXT: s_cbranch_execz BB2_3 227; GFX10-32-NEXT: ; %bb.1: ; %.demote 228; GFX10-32-NEXT: s_andn2_b32 s1, s1, exec_lo 229; GFX10-32-NEXT: s_cbranch_scc0 BB2_4 230; GFX10-32-NEXT: ; %bb.2: ; %.demote 231; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 232; GFX10-32-NEXT: BB2_3: ; %.continue 233; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s0 234; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo 235; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm 236; GFX10-32-NEXT: s_endpgm 237; GFX10-32-NEXT: BB2_4: 238; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 239; GFX10-32-NEXT: exp null off, off, off, off done vm 240; GFX10-32-NEXT: s_endpgm 241; 242; GFX10-64-LABEL: branch: 243; GFX10-64: ; %bb.0: ; %.entry 244; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 245; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 246; GFX10-64-NEXT: s_mov_b64 s[2:3], exec 247; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 248; GFX10-64-NEXT: v_and_b32_e32 v1, 1, v0 249; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 250; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 251; GFX10-64-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 252; GFX10-64-NEXT: s_and_saveexec_b64 
s[4:5], s[0:1] 253; GFX10-64-NEXT: s_xor_b64 s[0:1], exec, s[4:5] 254; GFX10-64-NEXT: s_cbranch_execz BB2_3 255; GFX10-64-NEXT: ; %bb.1: ; %.demote 256; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec 257; GFX10-64-NEXT: s_cbranch_scc0 BB2_4 258; GFX10-64-NEXT: ; %bb.2: ; %.demote 259; GFX10-64-NEXT: s_mov_b64 exec, 0 260; GFX10-64-NEXT: BB2_3: ; %.continue 261; GFX10-64-NEXT: s_or_b64 exec, exec, s[0:1] 262; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 263; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm 264; GFX10-64-NEXT: s_endpgm 265; GFX10-64-NEXT: BB2_4: 266; GFX10-64-NEXT: s_mov_b64 exec, 0 267; GFX10-64-NEXT: exp null off, off, off, off done vm 268; GFX10-64-NEXT: s_endpgm 269.entry: 270 %i0 = fptosi float %arg0 to i32 271 %i1 = fptosi float %arg1 to i32 272 %c0 = or i32 %i0, %i1 273 %c1 = and i32 %c0, 1 274 %c2 = icmp eq i32 %c1, 0 275 br i1 %c2, label %.continue, label %.demote 276 277.demote: 278 call void @llvm.amdgcn.wqm.demote(i1 false) 279 br label %.continue 280 281.continue: 282 %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 283 call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 284 ret void 285} 286 287 288define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { 289; SI-LABEL: wqm_demote_1: 290; SI: ; %bb.0: ; %.entry 291; SI-NEXT: s_mov_b64 s[12:13], exec 292; SI-NEXT: s_wqm_b64 exec, exec 293; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 294; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc 295; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 296; SI-NEXT: s_cbranch_execz BB3_3 297; SI-NEXT: ; %bb.1: ; %.demote 298; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 299; SI-NEXT: s_cbranch_scc0 BB3_4 300; SI-NEXT: ; %bb.2: ; %.demote 301; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] 302; SI-NEXT: s_and_b64 exec, exec, s[16:17] 303; SI-NEXT: BB3_3: ; %.continue 304; SI-NEXT: s_or_b64 exec, 
exec, s[14:15] 305; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 306; SI-NEXT: s_waitcnt vmcnt(0) 307; SI-NEXT: v_add_f32_e32 v0, v0, v0 308; SI-NEXT: s_and_b64 exec, exec, s[12:13] 309; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 310; SI-NEXT: s_waitcnt vmcnt(0) 311; SI-NEXT: s_branch BB3_5 312; SI-NEXT: BB3_4: 313; SI-NEXT: s_mov_b64 exec, 0 314; SI-NEXT: exp null off, off, off, off done vm 315; SI-NEXT: s_endpgm 316; SI-NEXT: BB3_5: 317; 318; GFX9-LABEL: wqm_demote_1: 319; GFX9: ; %bb.0: ; %.entry 320; GFX9-NEXT: s_mov_b64 s[12:13], exec 321; GFX9-NEXT: s_wqm_b64 exec, exec 322; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 323; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc 324; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 325; GFX9-NEXT: s_cbranch_execz BB3_3 326; GFX9-NEXT: ; %bb.1: ; %.demote 327; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 328; GFX9-NEXT: s_cbranch_scc0 BB3_4 329; GFX9-NEXT: ; %bb.2: ; %.demote 330; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] 331; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] 332; GFX9-NEXT: BB3_3: ; %.continue 333; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] 334; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 335; GFX9-NEXT: s_waitcnt vmcnt(0) 336; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 337; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] 338; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 339; GFX9-NEXT: s_waitcnt vmcnt(0) 340; GFX9-NEXT: s_branch BB3_5 341; GFX9-NEXT: BB3_4: 342; GFX9-NEXT: s_mov_b64 exec, 0 343; GFX9-NEXT: exp null off, off, off, off done vm 344; GFX9-NEXT: s_endpgm 345; GFX9-NEXT: BB3_5: 346; 347; GFX10-32-LABEL: wqm_demote_1: 348; GFX10-32: ; %bb.0: ; %.entry 349; GFX10-32-NEXT: s_mov_b32 s12, exec_lo 350; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo 351; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 352; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo 353; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 354; GFX10-32-NEXT: s_cbranch_execz BB3_3 355; GFX10-32-NEXT: ; %bb.1: ; 
%.demote 356; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo 357; GFX10-32-NEXT: s_cbranch_scc0 BB3_4 358; GFX10-32-NEXT: ; %bb.2: ; %.demote 359; GFX10-32-NEXT: s_wqm_b32 s14, s12 360; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 361; GFX10-32-NEXT: BB3_3: ; %.continue 362; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 363; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 364; GFX10-32-NEXT: s_waitcnt vmcnt(0) 365; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 366; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 367; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 368; GFX10-32-NEXT: s_waitcnt vmcnt(0) 369; GFX10-32-NEXT: s_branch BB3_5 370; GFX10-32-NEXT: BB3_4: 371; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 372; GFX10-32-NEXT: exp null off, off, off, off done vm 373; GFX10-32-NEXT: s_endpgm 374; GFX10-32-NEXT: BB3_5: 375; 376; GFX10-64-LABEL: wqm_demote_1: 377; GFX10-64: ; %bb.0: ; %.entry 378; GFX10-64-NEXT: s_mov_b64 s[12:13], exec 379; GFX10-64-NEXT: s_wqm_b64 exec, exec 380; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 381; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc 382; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 383; GFX10-64-NEXT: s_cbranch_execz BB3_3 384; GFX10-64-NEXT: ; %bb.1: ; %.demote 385; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 386; GFX10-64-NEXT: s_cbranch_scc0 BB3_4 387; GFX10-64-NEXT: ; %bb.2: ; %.demote 388; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] 389; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] 390; GFX10-64-NEXT: BB3_3: ; %.continue 391; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] 392; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 393; GFX10-64-NEXT: s_waitcnt vmcnt(0) 394; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 395; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] 396; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 397; GFX10-64-NEXT: s_waitcnt vmcnt(0) 398; GFX10-64-NEXT: s_branch BB3_5 
399; GFX10-64-NEXT: BB3_4: 400; GFX10-64-NEXT: s_mov_b64 exec, 0 401; GFX10-64-NEXT: exp null off, off, off, off done vm 402; GFX10-64-NEXT: s_endpgm 403; GFX10-64-NEXT: BB3_5: 404.entry: 405 %z.cmp = fcmp olt float %z, 0.0 406 br i1 %z.cmp, label %.continue, label %.demote 407 408.demote: 409 call void @llvm.amdgcn.wqm.demote(i1 false) 410 br label %.continue 411 412.continue: 413 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 414 %tex0 = extractelement <4 x float> %tex, i32 0 415 %tex1 = extractelement <4 x float> %tex, i32 0 416 %coord1 = fadd float %tex0, %tex1 417 %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 418 419 ret <4 x float> %rtex 420} 421 422define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { 423; SI-LABEL: wqm_demote_2: 424; SI: ; %bb.0: ; %.entry 425; SI-NEXT: s_mov_b64 s[12:13], exec 426; SI-NEXT: s_wqm_b64 exec, exec 427; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 428; SI-NEXT: s_waitcnt vmcnt(0) 429; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 430; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc 431; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 432; SI-NEXT: s_cbranch_execz BB4_3 433; SI-NEXT: ; %bb.1: ; %.demote 434; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 435; SI-NEXT: s_cbranch_scc0 BB4_4 436; SI-NEXT: ; %bb.2: ; %.demote 437; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] 438; SI-NEXT: s_and_b64 exec, exec, s[16:17] 439; SI-NEXT: BB4_3: ; %.continue 440; SI-NEXT: s_or_b64 exec, exec, s[14:15] 441; SI-NEXT: v_add_f32_e32 v0, v0, v0 442; SI-NEXT: s_and_b64 exec, exec, s[12:13] 443; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 444; SI-NEXT: s_waitcnt vmcnt(0) 445; SI-NEXT: s_branch BB4_5 446; SI-NEXT: BB4_4: 447; SI-NEXT: s_mov_b64 
exec, 0 448; SI-NEXT: exp null off, off, off, off done vm 449; SI-NEXT: s_endpgm 450; SI-NEXT: BB4_5: 451; 452; GFX9-LABEL: wqm_demote_2: 453; GFX9: ; %bb.0: ; %.entry 454; GFX9-NEXT: s_mov_b64 s[12:13], exec 455; GFX9-NEXT: s_wqm_b64 exec, exec 456; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 457; GFX9-NEXT: s_waitcnt vmcnt(0) 458; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 459; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc 460; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 461; GFX9-NEXT: s_cbranch_execz BB4_3 462; GFX9-NEXT: ; %bb.1: ; %.demote 463; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 464; GFX9-NEXT: s_cbranch_scc0 BB4_4 465; GFX9-NEXT: ; %bb.2: ; %.demote 466; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] 467; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] 468; GFX9-NEXT: BB4_3: ; %.continue 469; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] 470; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 471; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] 472; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 473; GFX9-NEXT: s_waitcnt vmcnt(0) 474; GFX9-NEXT: s_branch BB4_5 475; GFX9-NEXT: BB4_4: 476; GFX9-NEXT: s_mov_b64 exec, 0 477; GFX9-NEXT: exp null off, off, off, off done vm 478; GFX9-NEXT: s_endpgm 479; GFX9-NEXT: BB4_5: 480; 481; GFX10-32-LABEL: wqm_demote_2: 482; GFX10-32: ; %bb.0: ; %.entry 483; GFX10-32-NEXT: s_mov_b32 s12, exec_lo 484; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo 485; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 486; GFX10-32-NEXT: s_waitcnt vmcnt(0) 487; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 488; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo 489; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 490; GFX10-32-NEXT: s_cbranch_execz BB4_3 491; GFX10-32-NEXT: ; %bb.1: ; %.demote 492; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo 493; GFX10-32-NEXT: s_cbranch_scc0 BB4_4 494; GFX10-32-NEXT: ; %bb.2: ; %.demote 495; GFX10-32-NEXT: s_wqm_b32 s14, s12 496; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 497; 
GFX10-32-NEXT: BB4_3: ; %.continue 498; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 499; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 500; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 501; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 502; GFX10-32-NEXT: s_waitcnt vmcnt(0) 503; GFX10-32-NEXT: s_branch BB4_5 504; GFX10-32-NEXT: BB4_4: 505; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 506; GFX10-32-NEXT: exp null off, off, off, off done vm 507; GFX10-32-NEXT: s_endpgm 508; GFX10-32-NEXT: BB4_5: 509; 510; GFX10-64-LABEL: wqm_demote_2: 511; GFX10-64: ; %bb.0: ; %.entry 512; GFX10-64-NEXT: s_mov_b64 s[12:13], exec 513; GFX10-64-NEXT: s_wqm_b64 exec, exec 514; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 515; GFX10-64-NEXT: s_waitcnt vmcnt(0) 516; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 517; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc 518; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 519; GFX10-64-NEXT: s_cbranch_execz BB4_3 520; GFX10-64-NEXT: ; %bb.1: ; %.demote 521; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 522; GFX10-64-NEXT: s_cbranch_scc0 BB4_4 523; GFX10-64-NEXT: ; %bb.2: ; %.demote 524; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] 525; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] 526; GFX10-64-NEXT: BB4_3: ; %.continue 527; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] 528; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 529; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] 530; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 531; GFX10-64-NEXT: s_waitcnt vmcnt(0) 532; GFX10-64-NEXT: s_branch BB4_5 533; GFX10-64-NEXT: BB4_4: 534; GFX10-64-NEXT: s_mov_b64 exec, 0 535; GFX10-64-NEXT: exp null off, off, off, off done vm 536; GFX10-64-NEXT: s_endpgm 537; GFX10-64-NEXT: BB4_5: 538.entry: 539 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 540 %tex0 = 
extractelement <4 x float> %tex, i32 0 541 %tex1 = extractelement <4 x float> %tex, i32 0 542 %z.cmp = fcmp olt float %tex0, 0.0 543 br i1 %z.cmp, label %.continue, label %.demote 544 545.demote: 546 call void @llvm.amdgcn.wqm.demote(i1 false) 547 br label %.continue 548 549.continue: 550 %coord1 = fadd float %tex0, %tex1 551 %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 552 553 ret <4 x float> %rtex 554} 555 556define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { 557; SI-LABEL: wqm_demote_dynamic: 558; SI: ; %bb.0: ; %.entry 559; SI-NEXT: s_mov_b64 s[12:13], exec 560; SI-NEXT: s_wqm_b64 exec, exec 561; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 562; SI-NEXT: s_waitcnt vmcnt(0) 563; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 564; SI-NEXT: s_xor_b64 s[14:15], vcc, exec 565; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] 566; SI-NEXT: s_cbranch_scc0 BB5_2 567; SI-NEXT: ; %bb.1: ; %.entry 568; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] 569; SI-NEXT: s_and_b64 exec, exec, s[14:15] 570; SI-NEXT: v_add_f32_e32 v0, v0, v0 571; SI-NEXT: s_and_b64 exec, exec, s[12:13] 572; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 573; SI-NEXT: s_waitcnt vmcnt(0) 574; SI-NEXT: s_branch BB5_3 575; SI-NEXT: BB5_2: 576; SI-NEXT: s_mov_b64 exec, 0 577; SI-NEXT: exp null off, off, off, off done vm 578; SI-NEXT: s_endpgm 579; SI-NEXT: BB5_3: 580; 581; GFX9-LABEL: wqm_demote_dynamic: 582; GFX9: ; %bb.0: ; %.entry 583; GFX9-NEXT: s_mov_b64 s[12:13], exec 584; GFX9-NEXT: s_wqm_b64 exec, exec 585; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 586; GFX9-NEXT: s_waitcnt vmcnt(0) 587; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 588; GFX9-NEXT: s_xor_b64 s[14:15], vcc, exec 589; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] 590; GFX9-NEXT: s_cbranch_scc0 BB5_2 
591; GFX9-NEXT: ; %bb.1: ; %.entry 592; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] 593; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] 594; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 595; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] 596; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 597; GFX9-NEXT: s_waitcnt vmcnt(0) 598; GFX9-NEXT: s_branch BB5_3 599; GFX9-NEXT: BB5_2: 600; GFX9-NEXT: s_mov_b64 exec, 0 601; GFX9-NEXT: exp null off, off, off, off done vm 602; GFX9-NEXT: s_endpgm 603; GFX9-NEXT: BB5_3: 604; 605; GFX10-32-LABEL: wqm_demote_dynamic: 606; GFX10-32: ; %bb.0: ; %.entry 607; GFX10-32-NEXT: s_mov_b32 s12, exec_lo 608; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo 609; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 610; GFX10-32-NEXT: s_waitcnt vmcnt(0) 611; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 612; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, exec_lo 613; GFX10-32-NEXT: s_andn2_b32 s12, s12, s13 614; GFX10-32-NEXT: s_cbranch_scc0 BB5_2 615; GFX10-32-NEXT: ; %bb.1: ; %.entry 616; GFX10-32-NEXT: s_wqm_b32 s13, s12 617; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 618; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 619; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 620; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 621; GFX10-32-NEXT: s_waitcnt vmcnt(0) 622; GFX10-32-NEXT: s_branch BB5_3 623; GFX10-32-NEXT: BB5_2: 624; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 625; GFX10-32-NEXT: exp null off, off, off, off done vm 626; GFX10-32-NEXT: s_endpgm 627; GFX10-32-NEXT: BB5_3: 628; 629; GFX10-64-LABEL: wqm_demote_dynamic: 630; GFX10-64: ; %bb.0: ; %.entry 631; GFX10-64-NEXT: s_mov_b64 s[12:13], exec 632; GFX10-64-NEXT: s_wqm_b64 exec, exec 633; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 634; GFX10-64-NEXT: s_waitcnt vmcnt(0) 635; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 636; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, exec 637; GFX10-64-NEXT: s_andn2_b64 s[12:13], 
s[12:13], s[14:15] 638; GFX10-64-NEXT: s_cbranch_scc0 BB5_2 639; GFX10-64-NEXT: ; %bb.1: ; %.entry 640; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13] 641; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15] 642; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 643; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] 644; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 645; GFX10-64-NEXT: s_waitcnt vmcnt(0) 646; GFX10-64-NEXT: s_branch BB5_3 647; GFX10-64-NEXT: BB5_2: 648; GFX10-64-NEXT: s_mov_b64 exec, 0 649; GFX10-64-NEXT: exp null off, off, off, off done vm 650; GFX10-64-NEXT: s_endpgm 651; GFX10-64-NEXT: BB5_3: 652.entry: 653 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 654 %tex0 = extractelement <4 x float> %tex, i32 0 655 %tex1 = extractelement <4 x float> %tex, i32 0 656 %z.cmp = fcmp olt float %tex0, 0.0 657 call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) 658 %coord1 = fadd float %tex0, %tex1 659 %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 660 661 ret <4 x float> %rtex 662} 663 664 665define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { 666; SI-LABEL: wqm_deriv: 667; SI: ; %bb.0: ; %.entry 668; SI-NEXT: s_mov_b64 s[0:1], exec 669; SI-NEXT: s_wqm_b64 exec, exec 670; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 671; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 672; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc 673; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 674; SI-NEXT: s_cbranch_execz BB6_3 675; SI-NEXT: ; %bb.1: ; %.demote0 676; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 677; SI-NEXT: s_cbranch_scc0 BB6_7 678; SI-NEXT: ; %bb.2: ; %.demote0 679; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] 680; SI-NEXT: s_and_b64 exec, exec, s[4:5] 681; SI-NEXT: BB6_3: ; %.continue0 682; SI-NEXT: s_or_b64 exec, exec, s[2:3] 683; SI-NEXT: s_mov_b64 s[2:3], s[0:1] 684; SI-NEXT: 
v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] 685; SI-NEXT: v_mov_b32_e32 v1, v0 686; SI-NEXT: s_xor_b64 s[2:3], s[0:1], -1 687; SI-NEXT: s_nop 0 688; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 689; SI-NEXT: s_nop 1 690; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 691; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 692; SI-NEXT: s_and_b64 exec, exec, s[0:1] 693; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 694; SI-NEXT: s_or_b64 s[2:3], s[2:3], vcc 695; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] 696; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] 697; SI-NEXT: s_cbranch_execz BB6_6 698; SI-NEXT: ; %bb.4: ; %.demote1 699; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 700; SI-NEXT: s_cbranch_scc0 BB6_7 701; SI-NEXT: ; %bb.5: ; %.demote1 702; SI-NEXT: s_mov_b64 exec, 0 703; SI-NEXT: BB6_6: ; %.continue1 704; SI-NEXT: s_or_b64 exec, exec, s[2:3] 705; SI-NEXT: v_bfrev_b32_e32 v0, 60 706; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 707; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm 708; SI-NEXT: s_endpgm 709; SI-NEXT: BB6_7: 710; SI-NEXT: s_mov_b64 exec, 0 711; SI-NEXT: exp null off, off, off, off done vm 712; SI-NEXT: s_endpgm 713; 714; GFX9-LABEL: wqm_deriv: 715; GFX9: ; %bb.0: ; %.entry 716; GFX9-NEXT: s_mov_b64 s[0:1], exec 717; GFX9-NEXT: s_wqm_b64 exec, exec 718; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 719; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 720; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 721; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 722; GFX9-NEXT: s_cbranch_execz BB6_3 723; GFX9-NEXT: ; %bb.1: ; %.demote0 724; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 725; GFX9-NEXT: s_cbranch_scc0 BB6_7 726; GFX9-NEXT: ; %bb.2: ; %.demote0 727; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] 728; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] 729; GFX9-NEXT: BB6_3: ; %.continue0 730; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 731; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] 732; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] 733; 
GFX9-NEXT: v_mov_b32_e32 v1, v0 734; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 735; GFX9-NEXT: s_nop 0 736; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 737; GFX9-NEXT: s_nop 1 738; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 739; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 740; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] 741; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 742; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], vcc 743; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] 744; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] 745; GFX9-NEXT: s_cbranch_execz BB6_6 746; GFX9-NEXT: ; %bb.4: ; %.demote1 747; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 748; GFX9-NEXT: s_cbranch_scc0 BB6_7 749; GFX9-NEXT: ; %bb.5: ; %.demote1 750; GFX9-NEXT: s_mov_b64 exec, 0 751; GFX9-NEXT: BB6_6: ; %.continue1 752; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 753; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 754; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 755; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm 756; GFX9-NEXT: s_endpgm 757; GFX9-NEXT: BB6_7: 758; GFX9-NEXT: s_mov_b64 exec, 0 759; GFX9-NEXT: exp null off, off, off, off done vm 760; GFX9-NEXT: s_endpgm 761; 762; GFX10-32-LABEL: wqm_deriv: 763; GFX10-32: ; %bb.0: ; %.entry 764; GFX10-32-NEXT: s_mov_b32 s0, exec_lo 765; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo 766; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 767; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 768; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo 769; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 770; GFX10-32-NEXT: s_cbranch_execz BB6_3 771; GFX10-32-NEXT: ; %bb.1: ; %.demote0 772; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo 773; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 774; GFX10-32-NEXT: ; %bb.2: ; %.demote0 775; GFX10-32-NEXT: s_wqm_b32 s2, s0 776; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 777; GFX10-32-NEXT: BB6_3: ; %.continue0 778; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 779; GFX10-32-NEXT: s_mov_b32 
s1, s0 780; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 781; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 782; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 783; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 784; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 785; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 786; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 787; GFX10-32-NEXT: s_xor_b32 s1, s0, -1 788; GFX10-32-NEXT: s_or_b32 s1, s1, vcc_lo 789; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 790; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 791; GFX10-32-NEXT: s_cbranch_execz BB6_6 792; GFX10-32-NEXT: ; %bb.4: ; %.demote1 793; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo 794; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 795; GFX10-32-NEXT: ; %bb.5: ; %.demote1 796; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 797; GFX10-32-NEXT: BB6_6: ; %.continue1 798; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 799; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 800; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 801; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm 802; GFX10-32-NEXT: s_endpgm 803; GFX10-32-NEXT: BB6_7: 804; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 805; GFX10-32-NEXT: exp null off, off, off, off done vm 806; GFX10-32-NEXT: s_endpgm 807; 808; GFX10-64-LABEL: wqm_deriv: 809; GFX10-64: ; %bb.0: ; %.entry 810; GFX10-64-NEXT: s_mov_b64 s[0:1], exec 811; GFX10-64-NEXT: s_wqm_b64 exec, exec 812; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 813; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 814; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc 815; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 816; GFX10-64-NEXT: s_cbranch_execz BB6_3 817; GFX10-64-NEXT: ; %bb.1: ; %.demote0 818; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 819; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 820; GFX10-64-NEXT: ; %bb.2: ; %.demote0 821; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] 822; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] 823; 
GFX10-64-NEXT: BB6_3: ; %.continue0 824; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] 825; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] 826; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] 827; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 828; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 829; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 830; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 831; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] 832; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 833; GFX10-64-NEXT: s_xor_b64 s[2:3], s[0:1], -1 834; GFX10-64-NEXT: s_or_b64 s[2:3], s[2:3], vcc 835; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] 836; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] 837; GFX10-64-NEXT: s_cbranch_execz BB6_6 838; GFX10-64-NEXT: ; %bb.4: ; %.demote1 839; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 840; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 841; GFX10-64-NEXT: ; %bb.5: ; %.demote1 842; GFX10-64-NEXT: s_mov_b64 exec, 0 843; GFX10-64-NEXT: BB6_6: ; %.continue1 844; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] 845; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 846; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 847; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm 848; GFX10-64-NEXT: s_endpgm 849; GFX10-64-NEXT: BB6_7: 850; GFX10-64-NEXT: s_mov_b64 exec, 0 851; GFX10-64-NEXT: exp null off, off, off, off done vm 852; GFX10-64-NEXT: s_endpgm 853.entry: 854 %p0 = extractelement <2 x float> %input, i32 0 855 %p1 = extractelement <2 x float> %input, i32 1 856 %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 857 %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 858 %argi = fptosi float %arg to i32 859 %cond0 = icmp eq i32 %argi, 0 860 br i1 %cond0, label %.continue0, label %.demote0 861 862.demote0: 863 call void @llvm.amdgcn.wqm.demote(i1 false) 864 br label 
%.continue0

.continue0:
  %live = call i1 @llvm.amdgcn.live.mask()
  %live.cond = select i1 %live, i32 0, i32 1065353216
  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
  %live.v0f = bitcast i32 %live.v0 to float
  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
  %live.v1f = bitcast i32 %live.v1 to float
  %v0 = fsub float %live.v0f, %live.v1f
  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
  %cond2 = and i1 %live, %cond1
  br i1 %cond2, label %.continue1, label %.demote1

.demote1:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue1

.continue1:
  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
  ret void
}

; wqm_deriv_loop: loop variant of the wqm_deriv test above.
;
; llvm.amdgcn.wqm.demote(false) is called once before the loop (when %arg
; converts to a non-zero integer) and again inside the loop whenever %cond2 is
; false. Each iteration picks 0 or the loop counter depending on
; llvm.amdgcn.live.mask, subtracts two quad-permuted (DPP) copies of that value
; and passes the difference through llvm.amdgcn.wqm.f32.
;
; What the expected assembly below pins down, for every run line:
;  * the live mask is kept in s[0:1] (s0 for wave32) and updated with s_andn2
;    at each demote;
;  * after each demote, exec is re-derived from the live mask via s_wqm;
;  * both demote sites share one early-exit block (BB7_9: null export followed
;    by s_endpgm), reached through s_cbranch_scc0;
;  * exec is and-ed with the live mask in %.return before the final export.
;
; NOTE(review): %x0/%x1 (the interp.p1/p2 results) are not used by any later
; instruction in this function, and no interp instruction appears in the
; expected output. Presumably they are kept only to mirror a real pixel-shader
; prologue; confirm before removing.
define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
; SI-LABEL: wqm_deriv_loop:
; SI: ; %bb.0: ; %.entry
; SI-NEXT: s_mov_b64 s[0:1], exec
; SI-NEXT: s_wqm_b64 exec, exec
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_cbranch_execz BB7_3
; SI-NEXT: ; %bb.1: ; %.demote0
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT: s_cbranch_scc0 BB7_9
; SI-NEXT: ; %bb.2: ; %.demote0
; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
; SI-NEXT: s_and_b64 exec, exec, s[6:7]
; SI-NEXT: BB7_3: ; %.continue0.preheader
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_branch BB7_5
; SI-NEXT: BB7_4: ; %.continue1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_or_b64 exec, exec, s[6:7]
; SI-NEXT: s_add_i32 s2, s2, 1
; SI-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1
; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; SI-NEXT: s_andn2_b64 exec, exec, s[4:5]
; SI-NEXT: s_cbranch_execz BB7_8
; SI-NEXT: BB7_5: ; %.continue0
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_mov_b64 s[6:7], s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[6:7]
; SI-NEXT: v_mov_b32_e32 v2, v0
; SI-NEXT: s_xor_b64 s[6:7], s[0:1], -1
; SI-NEXT: s_nop 0
; SI-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: s_nop 1
; SI-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
; SI-NEXT: s_or_b64 s[6:7], s[6:7], vcc
; SI-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
; SI-NEXT: s_xor_b64 s[6:7], exec, s[8:9]
; SI-NEXT: s_cbranch_execz BB7_4
; SI-NEXT: ; %bb.6: ; %.demote1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT: s_cbranch_scc0 BB7_9
; SI-NEXT: ; %bb.7: ; %.demote1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_wqm_b64 s[8:9], s[0:1]
; SI-NEXT: s_and_b64 exec, exec, s[8:9]
; SI-NEXT: s_branch BB7_4
; SI-NEXT: BB7_8: ; %.return
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_and_b64 exec, exec, s[0:1]
; SI-NEXT: v_bfrev_b32_e32 v0, 60
; SI-NEXT: v_mov_b32_e32 v1, 0x3c00
; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm
; SI-NEXT: s_endpgm
; SI-NEXT: BB7_9:
; SI-NEXT: s_mov_b64 exec, 0
; SI-NEXT: exp null off, off, off, off done vm
; SI-NEXT: s_endpgm
;
; GFX9-LABEL: wqm_deriv_loop:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_wqm_b64 exec, exec
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s2, 0
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz BB7_3
; GFX9-NEXT: ; %bb.1: ; %.demote0
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT: s_cbranch_scc0 BB7_9
; GFX9-NEXT: ; %bb.2: ; %.demote0
; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX9-NEXT: BB7_3: ; %.continue0.preheader
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_branch BB7_5
; GFX9-NEXT: BB7_4: ; %.continue1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT: s_add_i32 s2, s2, 1
; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execz BB7_8
; GFX9-NEXT: BB7_5: ; %.continue0
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[6:7]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], vcc
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[8:9]
; GFX9-NEXT: s_cbranch_execz BB7_4
; GFX9-NEXT: ; %bb.6: ; %.demote1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT: s_cbranch_scc0 BB7_9
; GFX9-NEXT: ; %bb.7: ; %.demote1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1]
; GFX9-NEXT: s_and_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_branch BB7_4
; GFX9-NEXT: BB7_8: ; %.return
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX9-NEXT: v_bfrev_b32_e32 v1, 60
; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: BB7_9:
; GFX9-NEXT: s_mov_b64 exec, 0
; GFX9-NEXT: exp null off, off, off, off done vm
; GFX9-NEXT: s_endpgm
;
; GFX10-32-LABEL: wqm_deriv_loop:
; GFX10-32: ; %bb.0: ; %.entry
; GFX10-32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-32-NEXT: s_mov_b32 s1, 0
; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX10-32-NEXT: s_cbranch_execz BB7_3
; GFX10-32-NEXT: ; %bb.1: ; %.demote0
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 BB7_9
; GFX10-32-NEXT: ; %bb.2: ; %.demote0
; GFX10-32-NEXT: s_wqm_b32 s3, s0
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: BB7_3: ; %.continue0.preheader
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT: s_mov_b32 s2, 0
; GFX10-32-NEXT: s_branch BB7_5
; GFX10-32-NEXT: BB7_4: ; %.continue1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: s_add_i32 s2, s2, 1
; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1
; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_cbranch_execz BB7_8
; GFX10-32-NEXT: BB7_5: ; %.continue0
; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-32-NEXT: s_mov_b32 s3, s0
; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s3
; GFX10-32-NEXT: s_xor_b32 s3, s0, -1
; GFX10-32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_or_b32 s3, s3, vcc_lo
; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3
; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4
; GFX10-32-NEXT: s_cbranch_execz BB7_4
; GFX10-32-NEXT: ; %bb.6: ; %.demote1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 BB7_9
; GFX10-32-NEXT: ; %bb.7: ; %.demote1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_wqm_b32 s4, s0
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4
; GFX10-32-NEXT: s_branch BB7_4
; GFX10-32-NEXT: BB7_8: ; %.return
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60
; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-32-NEXT: s_endpgm
; GFX10-32-NEXT: BB7_9:
; GFX10-32-NEXT: s_mov_b32 exec_lo, 0
; GFX10-32-NEXT: exp null off, off, off, off done vm
; GFX10-32-NEXT: s_endpgm
;
; GFX10-64-LABEL: wqm_deriv_loop:
; GFX10-64: ; %bb.0: ; %.entry
; GFX10-64-NEXT: s_mov_b64 s[0:1], exec
; GFX10-64-NEXT: s_wqm_b64 exec, exec
; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-64-NEXT: s_mov_b32 s2, 0
; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX10-64-NEXT: s_cbranch_execz BB7_3
; GFX10-64-NEXT: ; %bb.1: ; %.demote0
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT: s_cbranch_scc0 BB7_9
; GFX10-64-NEXT: ; %bb.2: ; %.demote0
; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX10-64-NEXT: BB7_3: ; %.continue0.preheader
; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX10-64-NEXT: s_mov_b64 s[4:5], 0
; GFX10-64-NEXT: s_branch BB7_5
; GFX10-64-NEXT: BB7_4: ; %.continue1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX10-64-NEXT: s_add_i32 s2, s2, 1
; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1
; GFX10-64-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX10-64-NEXT: s_cbranch_execz BB7_8
; GFX10-64-NEXT: BB7_5: ; %.continue0
; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-64-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s2, 0, s[6:7]
; GFX10-64-NEXT: s_xor_b64 s[6:7], s[0:1], -1
; GFX10-64-NEXT: v_mov_b32_e32 v2, v0
; GFX10-64-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_or_b64 s[6:7], s[6:7], vcc
; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
; GFX10-64-NEXT: s_xor_b64 s[6:7], exec, s[8:9]
; GFX10-64-NEXT: s_cbranch_execz BB7_4
; GFX10-64-NEXT: ; %bb.6: ; %.demote1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT: s_cbranch_scc0 BB7_9
; GFX10-64-NEXT: ; %bb.7: ; %.demote1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9]
; GFX10-64-NEXT: s_branch BB7_4
; GFX10-64-NEXT: BB7_8: ; %.return
; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60
; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-64-NEXT: s_endpgm
; GFX10-64-NEXT: BB7_9:
; GFX10-64-NEXT: s_mov_b64 exec, 0
; GFX10-64-NEXT: exp null off, off, off, off done vm
; GFX10-64-NEXT: s_endpgm
.entry:
  %p0 = extractelement <2 x float> %input, i32 0
  %p1 = extractelement <2 x float> %input, i32 1
  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
  %argi = fptosi float %arg to i32
  %cond0 = icmp eq i32 %argi, 0
  ; Skip the pre-loop demote when %arg converts to integer zero.
  br i1 %cond0, label %.continue0, label %.demote0

.demote0:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue0

.continue0:
  ; Loop header. %count is 0 from either entry edge and %next from the back edge.
  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
  %live = call i1 @llvm.amdgcn.live.mask()
  ; 0 when live.mask returns true, otherwise the current loop counter.
  %live.cond = select i1 %live, i32 0, i32 %count
  ; dpp_ctrl 85 (0b01010101) shows up as quad_perm:[1,1,1,1] in the expected output.
  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
  %live.v0f = bitcast i32 %live.v0 to float
  ; dpp_ctrl 0 shows up as quad_perm:[0,0,0,0], folded into the subtract.
  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
  %live.v1f = bitcast i32 %live.v1 to float
  %v0 = fsub float %live.v0f, %live.v1f
  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
  ; Demote unless live.mask returned true AND the quad difference is exactly zero.
  %cond2 = and i1 %live, %cond1
  br i1 %cond2, label %.continue1, label %.demote1

.demote1:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue1

.continue1:
  ; Loop latch: iterate while the incremented counter is (signed) below %limit.
  %next = add i32 %count, 1
  %loop.cond = icmp slt i32 %next, %limit
  br i1 %loop.cond, label %.continue0, label %.return

.return:
  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
  ret void
}

; static_exact_nop: llvm.amdgcn.wqm.demote is called with a constant true
; condition. For every run line the expected output is only the
; compare / select / export / s_endpgm sequence: no exec-mask update and no
; early-exit block are emitted (contrast with static_exact at the top of the
; file, whose demote produces an s_andn2 on exec and a null-export exit block).
;
; NOTE(review): %c1 is computed but never used in this function.
define amdgpu_ps void @static_exact_nop(float %arg0, float %arg1) {
; SI-LABEL: static_exact_nop:
; SI: ; %bb.0: ; %.entry
; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm
; SI-NEXT: s_endpgm
;
; GFX9-LABEL: static_exact_nop:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm
; GFX9-NEXT: s_endpgm
;
; GFX10-32-LABEL: static_exact_nop:
; GFX10-32: ; %bb.0: ; %.entry
; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm
; GFX10-32-NEXT: s_endpgm
;
; GFX10-64-LABEL: static_exact_nop:
; GFX10-64: ; %bb.0: ; %.entry
; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm
; GFX10-64-NEXT: s_endpgm
.entry:
  %c0 = fcmp olt float %arg0, 0.000000e+00
  %c1 = fcmp oge float %arg1, 0.0
  call void @llvm.amdgcn.wqm.demote(i1 true)
  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
  ret void
}


; Declarations of the AMDGPU intrinsics called by the test functions.
; NOTE(review): llvm.amdgcn.image.sample.1d is not called in the part of the
; file covered by this review; presumably an earlier test uses it.
declare void @llvm.amdgcn.wqm.demote(i1) #0
declare i1 @llvm.amdgcn.live.mask() #0
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare float @llvm.amdgcn.wqm.f32(float) #1
declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4

; Attribute groups referenced by the #N suffixes on the declarations and calls.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind readnone speculatable }
attributes #3 = { inaccessiblememonly nounwind }
attributes #4 = { convergent nounwind readnone }