; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s

; GlobalISel codegen checks for the llvm.amdgcn.div.scale.f32 / .f64 intrinsic
; calls below, run for hawaii, fiji and gfx1010. In the expected output the
; trailing i1 immediate of the call picks the first source operand of the
; v_div_scale instruction: with `false` it is the second IR argument (%b, so
; the sources are b, b, a); with `true` it is the first IR argument (%a, so
; the sources are a, b, a).

; f32: %a and %b are volatile loads of two adjacent floats starting at
; %in[workitem id]; i1 operand is false. Only element 0 of the result is stored.
define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) {
; GFX7-LABEL: test_div_scale_f32_1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b32 s6, -1
; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, v2
; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    flat_load_dword v1, v[2:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s2, v2, v2, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0, align 4
  %b = load volatile float, float addrspace(1)* %gep.1, align 4

  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; Same IR as test_div_scale_f32_1 except that the i1 operand is true.
define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) {
; GFX7-LABEL: test_div_scale_f32_2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_mov_b32 s6, -1
; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v2, v0, v2
; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_2:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    flat_load_dword v1, v[2:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v0, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s2, v1, v2, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0, align 4
  %b = load volatile float, float addrspace(1)* %gep.1, align 4

  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; f64: %a and %b are volatile loads of two adjacent doubles starting at
; %in[workitem id]; i1 operand is false. %aptr is never used in the body.
define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) {
; GFX7-LABEL: test_div_scale_f64_1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f64_1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0, align 8
  %b = load volatile double, double addrspace(1)* %gep.1, align 8

  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}

; Same IR as test_div_scale_f64_1 except that the i1 operand is true.
define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) {
; GFX7-LABEL: test_div_scale_f64_2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s2
; GFX7-NEXT:    v_mov_b32_e32 v1, s3
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f64_2:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0, align 8
  %b = load volatile double, double addrspace(1)* %gep.1, align 8

  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}

; f32: %a is a kernel argument (placed after an [8 x i32] padding argument) and
; %b is loaded from %in[workitem id]; i1 operand is false. The checks expect %a
; to be used directly as an SGPR source of v_div_scale_f32.
define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, [8 x i32], float %a) {
; GFX7-LABEL: test_div_scale_f32_scalar_num_1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x15
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], v0, v0, s8
; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_scalar_num_1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x54
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], v0, v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_scalar_num_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x54
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s0, v0, v0, s0
; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid

  %b = load float, float addrspace(1)* %gep, align 4

  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; f32: %a is a kernel argument (no padding argument here) and %b is loaded from
; %in[workitem id]; i1 operand is true.
define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) {
; GFX7-LABEL: test_div_scale_f32_scalar_num_2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], s8, v0, s8
; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_scalar_num_2:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], s0, v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_scalar_num_2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s0, s0, v0, s0
; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid

  %b = load float, float addrspace(1)* %gep, align 4

  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; f32: %b is a kernel argument and %a is loaded from %in[workitem id]; i1
; operand is false, so the checks expect the SGPR holding %b in both of the
; first two source slots.
define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) {
; GFX7-LABEL: test_div_scale_f32_scalar_den_1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], s8, s8, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_scalar_den_1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], s0, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_scalar_den_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s0, s0, s0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid

  %a = load float, float addrspace(1)* %gep, align 4

  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; Same IR as test_div_scale_f32_scalar_den_1 except that the i1 operand is true.
define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) {
; GFX7-LABEL: test_div_scale_f32_scalar_den_2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], v0, s8, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_scalar_den_2:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], v0, s0, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_scalar_den_2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s0, v0, s0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid

  %a = load float, float addrspace(1)* %gep, align 4

  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; f64: %a is a kernel argument (after an [8 x i32] padding argument) and %b is
; loaded from %in[workitem id]; i1 operand is false.
define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32], double %a) {
; GFX7-LABEL: test_div_scale_f64_scalar_num_1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v2, s4
; GFX7-NEXT:    v_mov_b32_e32 v3, s5
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1]
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f64_scalar_num_1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, s4
; GFX8-NEXT:    v_mov_b32_e32 v3, s5
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1]
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_num_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1]
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid

  %b = load double, double addrspace(1)* %gep, align 8

  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}

; Same IR as test_div_scale_f64_scalar_num_1 except that the i1 operand is true.
define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32], double %a) {
; GFX7-LABEL: test_div_scale_f64_scalar_num_2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v2, s4
; GFX7-NEXT:    v_mov_b32_e32 v3, s5
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1]
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f64_scalar_num_2:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, s4
; GFX8-NEXT:    v_mov_b32_e32 v3, s5
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1]
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_num_2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1]
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid

  %b = load double, double addrspace(1)* %gep, align 8

  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}

; f64: %b is a kernel argument (after an [8 x i32] padding argument) and %a is
; loaded from %in[workitem id]; i1 operand is false.
define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32], double %b) {
; GFX7-LABEL: test_div_scale_f64_scalar_den_1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v2, s4
; GFX7-NEXT:    v_mov_b32_e32 v3, s5
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1]
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f64_scalar_den_1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, s4
; GFX8-NEXT:    v_mov_b32_e32 v3, s5
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1]
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_den_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1]
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid

  %a = load double, double addrspace(1)* %gep, align 8

  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}

; Same IR as test_div_scale_f64_scalar_den_1 except that the i1 operand is true.
define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32], double %b) {
; GFX7-LABEL: test_div_scale_f64_scalar_den_2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX7-NEXT:    v_mov_b32_e32 v2, s4
; GFX7-NEXT:    v_mov_b32_e32 v3, s5
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1]
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f64_scalar_den_2:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s6
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, s4
; GFX8-NEXT:    v_mov_b32_e32 v3, s5
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1]
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_den_2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1]
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid

  %a = load double, double addrspace(1)* %gep, align 8

  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}

; f32: both %a and %b are kernel arguments, each preceded by an [8 x i32]
; padding argument; i1 operand is false. The hawaii and fiji checks expect %b
; to be copied into a VGPR first, while the gfx1010 checks expect both SGPRs
; to be used directly.
define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) {
; GFX7-LABEL: test_div_scale_f32_all_scalar_1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x1c
; GFX7-NEXT:    s_mov_b32 s6, -1
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], v0, v0, s2
; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_all_scalar_1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s3
; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v0, v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_all_scalar_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x4c
; GFX10-NEXT:    s_load_dword s5, s[0:1], 0x70
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s0, s5, s5, s4
; GFX10-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_endpgm
  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; Same IR as test_div_scale_f32_all_scalar_1 except that the i1 operand is true.
define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) {
; GFX7-LABEL: test_div_scale_f32_all_scalar_2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x1c
; GFX7-NEXT:    s_mov_b32 s6, -1
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], s2, v0, s2
; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_all_scalar_2:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s3
; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], s2, v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_all_scalar_2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x4c
; GFX10-NEXT:    s_load_dword s5, s[0:1], 0x70
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s0, s4, s5, s4
; GFX10-NEXT:    global_store_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_endpgm
  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; f64: both %a and %b are kernel arguments, each preceded by an [8 x i32]
; padding argument; i1 operand is false.
define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) {
; GFX7-LABEL: test_div_scale_f64_all_scalar_1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f64_all_scalar_1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_all_scalar_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3]
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}

; Same IR as test_div_scale_f64_all_scalar_1 except that the i1 operand is true.
define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) {
; GFX7-LABEL: test_div_scale_f64_all_scalar_2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], s[2:3], v[0:1], s[2:3]
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f64_all_scalar_2:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], s[2:3], v[0:1], s[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_all_scalar_2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3]
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void
@test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) { 946; GFX7-LABEL: test_div_scale_f32_inline_imm_num: 947; GFX7: ; %bb.0: 948; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 949; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 950; GFX7-NEXT: v_mov_b32_e32 v1, 0 951; GFX7-NEXT: s_mov_b32 s6, 0 952; GFX7-NEXT: s_mov_b32 s7, 0xf000 953; GFX7-NEXT: s_waitcnt lgkmcnt(0) 954; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] 955; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 956; GFX7-NEXT: s_mov_b32 s6, -1 957; GFX7-NEXT: s_waitcnt vmcnt(0) 958; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, 1.0 959; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] 960; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 961; GFX7-NEXT: s_endpgm 962; 963; GFX8-LABEL: test_div_scale_f32_inline_imm_num: 964; GFX8: ; %bb.0: 965; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 966; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 967; GFX8-NEXT: s_waitcnt lgkmcnt(0) 968; GFX8-NEXT: v_mov_b32_e32 v0, s2 969; GFX8-NEXT: v_mov_b32_e32 v1, s3 970; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 971; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 972; GFX8-NEXT: flat_load_dword v0, v[0:1] 973; GFX8-NEXT: s_waitcnt vmcnt(0) 974; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, 1.0 975; GFX8-NEXT: v_mov_b32_e32 v0, s0 976; GFX8-NEXT: v_mov_b32_e32 v1, s1 977; GFX8-NEXT: flat_store_dword v[0:1], v2 978; GFX8-NEXT: s_endpgm 979; 980; GFX10-LABEL: test_div_scale_f32_inline_imm_num: 981; GFX10: ; %bb.0: 982; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 983; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 984; GFX10-NEXT: v_mov_b32_e32 v1, 0 985; GFX10-NEXT: s_waitcnt lgkmcnt(0) 986; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 987; GFX10-NEXT: s_waitcnt vmcnt(0) 988; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, 1.0 989; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 990; GFX10-NEXT: s_endpgm 991 %tid = call i32 @llvm.amdgcn.workitem.id.x() 992 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid 993 %a = 
load float, float addrspace(1)* %gep.0, align 4 994 995 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 1.0, float %a, i1 false) 996 %result0 = extractvalue { float, i1 } %result, 0 997 store float %result0, float addrspace(1)* %out, align 4 998 ret void 999} 1000 1001define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) { 1002; GFX7-LABEL: test_div_scale_f32_inline_imm_den: 1003; GFX7: ; %bb.0: 1004; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1005; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1006; GFX7-NEXT: v_mov_b32_e32 v1, 0 1007; GFX7-NEXT: s_mov_b32 s6, 0 1008; GFX7-NEXT: s_mov_b32 s7, 0xf000 1009; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1010; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] 1011; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1012; GFX7-NEXT: s_mov_b32 s6, -1 1013; GFX7-NEXT: s_waitcnt vmcnt(0) 1014; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], 2.0, 2.0, v0 1015; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] 1016; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1017; GFX7-NEXT: s_endpgm 1018; 1019; GFX8-LABEL: test_div_scale_f32_inline_imm_den: 1020; GFX8: ; %bb.0: 1021; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1022; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1023; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1024; GFX8-NEXT: v_mov_b32_e32 v0, s2 1025; GFX8-NEXT: v_mov_b32_e32 v1, s3 1026; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1027; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1028; GFX8-NEXT: flat_load_dword v0, v[0:1] 1029; GFX8-NEXT: s_waitcnt vmcnt(0) 1030; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0 1031; GFX8-NEXT: v_mov_b32_e32 v0, s0 1032; GFX8-NEXT: v_mov_b32_e32 v1, s1 1033; GFX8-NEXT: flat_store_dword v[0:1], v2 1034; GFX8-NEXT: s_endpgm 1035; 1036; GFX10-LABEL: test_div_scale_f32_inline_imm_den: 1037; GFX10: ; %bb.0: 1038; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1039; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1040; GFX10-NEXT: v_mov_b32_e32 v1, 0 1041; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) 1042; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1043; GFX10-NEXT: s_waitcnt vmcnt(0) 1044; GFX10-NEXT: v_div_scale_f32 v0, s2, 2.0, 2.0, v0 1045; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1046; GFX10-NEXT: s_endpgm 1047 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1048 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid 1049 %a = load float, float addrspace(1)* %gep.0, align 4 1050 1051 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float 2.0, i1 false) 1052 %result0 = extractvalue { float, i1 } %result, 0 1053 store float %result0, float addrspace(1)* %out, align 4 1054 ret void 1055} 1056 1057define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) { 1058; GFX7-LABEL: test_div_scale_f32_fabs_num: 1059; GFX7: ; %bb.0: 1060; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1061; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1062; GFX7-NEXT: v_mov_b32_e32 v1, 0 1063; GFX7-NEXT: s_mov_b32 s6, 0 1064; GFX7-NEXT: s_mov_b32 s7, 0xf000 1065; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1066; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] 1067; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc 1068; GFX7-NEXT: s_waitcnt vmcnt(0) 1069; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc 1070; GFX7-NEXT: s_waitcnt vmcnt(0) 1071; GFX7-NEXT: s_mov_b32 s6, -1 1072; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2 1073; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v1 1074; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] 1075; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1076; GFX7-NEXT: s_endpgm 1077; 1078; GFX8-LABEL: test_div_scale_f32_fabs_num: 1079; GFX8: ; %bb.0: 1080; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1081; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1082; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1083; GFX8-NEXT: v_mov_b32_e32 v0, s2 1084; GFX8-NEXT: v_mov_b32_e32 v1, s3 1085; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1086; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1087; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, 4, v0 1088; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 1089; GFX8-NEXT: flat_load_dword v0, v[0:1] glc 1090; GFX8-NEXT: s_waitcnt vmcnt(0) 1091; GFX8-NEXT: flat_load_dword v1, v[2:3] glc 1092; GFX8-NEXT: s_waitcnt vmcnt(0) 1093; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 1094; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 1095; GFX8-NEXT: v_mov_b32_e32 v0, s0 1096; GFX8-NEXT: v_mov_b32_e32 v1, s1 1097; GFX8-NEXT: flat_store_dword v[0:1], v2 1098; GFX8-NEXT: s_endpgm 1099; 1100; GFX10-LABEL: test_div_scale_f32_fabs_num: 1101; GFX10: ; %bb.0: 1102; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1103; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1104; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1105; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc 1106; GFX10-NEXT: s_waitcnt vmcnt(0) 1107; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc 1108; GFX10-NEXT: s_waitcnt vmcnt(0) 1109; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1 1110; GFX10-NEXT: v_mov_b32_e32 v1, 0 1111; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v0 1112; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1113; GFX10-NEXT: s_endpgm 1114 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1115 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid 1116 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 1117 1118 %a = load volatile float, float addrspace(1)* %gep.0, align 4 1119 %b = load volatile float, float addrspace(1)* %gep.1, align 4 1120 1121 %a.fabs = call float @llvm.fabs.f32(float %a) 1122 1123 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a.fabs, float %b, i1 false) 1124 %result0 = extractvalue { float, i1 } %result, 0 1125 store float %result0, float addrspace(1)* %out, align 4 1126 ret void 1127} 1128 1129define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) { 1130; GFX7-LABEL: test_div_scale_f32_fabs_den: 1131; GFX7: ; %bb.0: 1132; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 
1133; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1134; GFX7-NEXT: v_mov_b32_e32 v1, 0 1135; GFX7-NEXT: s_mov_b32 s6, 0 1136; GFX7-NEXT: s_mov_b32 s7, 0xf000 1137; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1138; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] 1139; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc 1140; GFX7-NEXT: s_waitcnt vmcnt(0) 1141; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc 1142; GFX7-NEXT: s_waitcnt vmcnt(0) 1143; GFX7-NEXT: s_mov_b32 s6, -1 1144; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 1145; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v2 1146; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] 1147; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1148; GFX7-NEXT: s_endpgm 1149; 1150; GFX8-LABEL: test_div_scale_f32_fabs_den: 1151; GFX8: ; %bb.0: 1152; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1153; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1154; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1155; GFX8-NEXT: v_mov_b32_e32 v0, s2 1156; GFX8-NEXT: v_mov_b32_e32 v1, s3 1157; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1158; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1159; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 1160; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 1161; GFX8-NEXT: flat_load_dword v0, v[0:1] glc 1162; GFX8-NEXT: s_waitcnt vmcnt(0) 1163; GFX8-NEXT: flat_load_dword v1, v[2:3] glc 1164; GFX8-NEXT: s_waitcnt vmcnt(0) 1165; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 1166; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 1167; GFX8-NEXT: v_mov_b32_e32 v0, s0 1168; GFX8-NEXT: v_mov_b32_e32 v1, s1 1169; GFX8-NEXT: flat_store_dword v[0:1], v2 1170; GFX8-NEXT: s_endpgm 1171; 1172; GFX10-LABEL: test_div_scale_f32_fabs_den: 1173; GFX10: ; %bb.0: 1174; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1175; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1176; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1177; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc 1178; GFX10-NEXT: s_waitcnt vmcnt(0) 1179; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc 
1180; GFX10-NEXT: s_waitcnt vmcnt(0) 1181; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v2 1182; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 1183; GFX10-NEXT: v_mov_b32_e32 v1, 0 1184; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1185; GFX10-NEXT: s_endpgm 1186 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1187 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid 1188 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 1189 1190 %a = load volatile float, float addrspace(1)* %gep.0, align 4 1191 %b = load volatile float, float addrspace(1)* %gep.1, align 4 1192 1193 %b.fabs = call float @llvm.fabs.f32(float %b) 1194 1195 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b.fabs, i1 false) 1196 %result0 = extractvalue { float, i1 } %result, 0 1197 store float %result0, float addrspace(1)* %out, align 4 1198 ret void 1199} 1200 1201define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)* %out) #0 { 1202; GFX7-LABEL: test_div_scale_f32_val_undef_val: 1203; GFX7: ; %bb.0: 1204; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1205; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000 1206; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1207; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, v0 1208; GFX7-NEXT: s_mov_b32 s2, -1 1209; GFX7-NEXT: s_mov_b32 s3, 0xf000 1210; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1211; GFX7-NEXT: s_endpgm 1212; 1213; GFX8-LABEL: test_div_scale_f32_val_undef_val: 1214; GFX8: ; %bb.0: 1215; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000 1216; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, v0 1217; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1218; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1219; GFX8-NEXT: v_mov_b32_e32 v0, s0 1220; GFX8-NEXT: v_mov_b32_e32 v1, s1 1221; GFX8-NEXT: flat_store_dword v[0:1], v2 1222; GFX8-NEXT: s_endpgm 1223; 1224; GFX10-LABEL: test_div_scale_f32_val_undef_val: 1225; GFX10: ; %bb.0: 1226; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1227; GFX10-NEXT: v_mov_b32_e32 v1, 0 1228; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) 1229; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000 1230; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1231; GFX10-NEXT: s_endpgm 1232 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false) 1233 %result0 = extractvalue { float, i1 } %result, 0 1234 store float %result0, float addrspace(1)* %out, align 4 1235 ret void 1236} 1237 1238define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* %out) #0 { 1239; GFX7-LABEL: test_div_scale_f32_undef_val_val: 1240; GFX7: ; %bb.0: 1241; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1242; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000 1243; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1244; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, s0 1245; GFX7-NEXT: s_mov_b32 s2, -1 1246; GFX7-NEXT: s_mov_b32 s3, 0xf000 1247; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1248; GFX7-NEXT: s_endpgm 1249; 1250; GFX8-LABEL: test_div_scale_f32_undef_val_val: 1251; GFX8: ; %bb.0: 1252; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000 1253; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s0 1254; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1255; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1256; GFX8-NEXT: v_mov_b32_e32 v0, s0 1257; GFX8-NEXT: v_mov_b32_e32 v1, s1 1258; GFX8-NEXT: flat_store_dword v[0:1], v2 1259; GFX8-NEXT: s_endpgm 1260; 1261; GFX10-LABEL: test_div_scale_f32_undef_val_val: 1262; GFX10: ; %bb.0: 1263; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1264; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000 1265; GFX10-NEXT: v_mov_b32_e32 v1, 0 1266; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1267; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, s0 1268; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1269; GFX10-NEXT: s_endpgm 1270 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false) 1271 %result0 = extractvalue { float, i1 } %result, 0 1272 store float %result0, float addrspace(1)* %out, align 4 1273 ret void 1274} 1275 1276define amdgpu_kernel void 
@test_div_scale_f32_undef_undef_val(float addrspace(1)* %out) #0 { 1277; GFX7-LABEL: test_div_scale_f32_undef_undef_val: 1278; GFX7: ; %bb.0: 1279; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1280; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1281; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, s0 1282; GFX7-NEXT: s_mov_b32 s2, -1 1283; GFX7-NEXT: s_mov_b32 s3, 0xf000 1284; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1285; GFX7-NEXT: s_endpgm 1286; 1287; GFX8-LABEL: test_div_scale_f32_undef_undef_val: 1288; GFX8: ; %bb.0: 1289; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, s0 1290; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1291; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1292; GFX8-NEXT: v_mov_b32_e32 v0, s0 1293; GFX8-NEXT: v_mov_b32_e32 v1, s1 1294; GFX8-NEXT: flat_store_dword v[0:1], v2 1295; GFX8-NEXT: s_endpgm 1296; 1297; GFX10-LABEL: test_div_scale_f32_undef_undef_val: 1298; GFX10: ; %bb.0: 1299; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1300; GFX10-NEXT: v_mov_b32_e32 v1, 0 1301; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1302; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0 1303; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1304; GFX10-NEXT: s_endpgm 1305 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false) 1306 %result0 = extractvalue { float, i1 } %result, 0 1307 store float %result0, float addrspace(1)* %out, align 4 1308 ret void 1309} 1310 1311define amdgpu_kernel void @test_div_scale_f64_val_undef_val(double addrspace(1)* %out) #0 { 1312; GFX7-LABEL: test_div_scale_f64_val_undef_val: 1313; GFX7: ; %bb.0: 1314; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1315; GFX7-NEXT: s_mov_b32 s2, 0 1316; GFX7-NEXT: s_mov_b32 s3, 0x40200000 1317; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3] 1318; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1319; GFX7-NEXT: v_mov_b32_e32 v3, s1 1320; GFX7-NEXT: v_mov_b32_e32 v2, s0 1321; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1322; GFX7-NEXT: s_endpgm 1323; 1324; GFX8-LABEL: 
test_div_scale_f64_val_undef_val: 1325; GFX8: ; %bb.0: 1326; GFX8-NEXT: s_mov_b32 s2, 0 1327; GFX8-NEXT: s_mov_b32 s3, 0x40200000 1328; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3] 1329; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1330; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1331; GFX8-NEXT: v_mov_b32_e32 v3, s1 1332; GFX8-NEXT: v_mov_b32_e32 v2, s0 1333; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1334; GFX8-NEXT: s_endpgm 1335; 1336; GFX10-LABEL: test_div_scale_f64_val_undef_val: 1337; GFX10: ; %bb.0: 1338; GFX10-NEXT: s_mov_b32 s2, 0 1339; GFX10-NEXT: s_mov_b32 s3, 0x40200000 1340; GFX10-NEXT: v_mov_b32_e32 v2, 0 1341; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], s[2:3] 1342; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1343; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1344; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1345; GFX10-NEXT: s_endpgm 1346 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false) 1347 %result0 = extractvalue { double, i1 } %result, 0 1348 store double %result0, double addrspace(1)* %out, align 8 1349 ret void 1350} 1351 1352declare i32 @llvm.amdgcn.workitem.id.x() #1 1353declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) #1 1354declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) #1 1355declare float @llvm.fabs.f32(float) #1 1356 1357attributes #0 = { nounwind } 1358attributes #1 = { nounwind readnone speculatable } 1359