; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s

; Expected GlobalISel output for the three-operand median idiom
;   maxnum(minnum(x, y), minnum(maxnum(x, y), z))
; on f32 values loaded (volatile) from global memory. The llc invocations above
; cover the default amdgcn target (checked under the SI prefix), tonga (VI),
; gfx900 (GFX9) and gfx1010 (GFX10). The assertion blocks are autogenerated;
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.

; Operand 0 is negated with `fsub float -0.0, %a` and the kernel carries
; attribute set #2 ("no-nans-fp-math"="true"). Every target is expected to emit
; a single v_med3_f32, with the negation done by a separate v_sub_f32 from
; 0x80000000 rather than by a source modifier on v_med3_f32.
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
; SI-NEXT: v_med3_f32 v2, v2, v3, v4
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v8, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v8
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v8
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; VI-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
; VI-NEXT: v_med3_f32 v0, v0, v1, v2
; VI-NEXT: flat_store_dword v[6:7], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2
  %a.fneg = fsub float -0.0, %a
  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}

; Same IR as the kernel above, but with attribute set #1
; ("no-nans-fp-math"="false"). No v_med3_f32 is expected: the v_min_f32 /
; v_max_f32 chain stays, and each input is first passed through v_mul_f32 by 1.0
; (SI, VI) or v_max_f32 x, x (GFX9, GFX10).
define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
; SI-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_f32_e32 v2, 0x80000000, v2
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: v_min_f32_e32 v5, v2, v3
; SI-NEXT: v_max_f32_e32 v2, v2, v3
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
; SI-NEXT: v_min_f32_e32 v2, v2, v3
; SI-NEXT: v_max_f32_e32 v2, v5, v2
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT: v_mul_f32_e32 v4, 1.0, v4
; VI-NEXT: v_min_f32_e32 v5, v4, v2
; VI-NEXT: v_max_f32_e32 v2, v4, v2
; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; VI-NEXT: v_min_f32_e32 v2, v2, v3
; VI-NEXT: v_max_f32_e32 v2, v5, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-NEXT: v_max_f32_e32 v2, v3, v3
; GFX9-NEXT: v_min_f32_e32 v1, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
; GFX10-NEXT: v_max_f32_e32 v4, v1, v2
; GFX10-NEXT: v_min_f32_e32 v1, v1, v2
; GFX10-NEXT: v_min_f32_e32 v2, v4, v3
; GFX10-NEXT: v_max_f32_e32 v1, v1, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2
  %a.fneg = fsub float -0.0, %a
  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}

; Modifiers on all three operands under attribute set #2: fneg on %a, fabs on
; %b, fneg(fabs) on %c. Expected: the fabs of %b becomes a |v| source modifier
; on v_med3_f32, while both negations are separate v_sub_f32 from 0x80000000
; held in s2 (the one for %c takes |v| on its source).
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s2, 0x80000000
; SI-NEXT: v_sub_f32_e32 v2, s2, v2
; SI-NEXT: v_sub_f32_e64 v4, s2, |v4|
; SI-NEXT: v_med3_f32 v2, v2, |v3|, v4
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s2, 0x80000000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_f32_e32 v4, s2, v7
; VI-NEXT: v_sub_f32_e64 v3, s2, |v3|
; VI-NEXT: v_med3_f32 v2, v4, |v2|, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s2, 0x80000000
; GFX9-NEXT: v_sub_f32_e32 v1, s2, v1
; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3|
; GFX9-NEXT: v_med3_f32 v1, v1, |v2|, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 s2, 0x80000000
; GFX10-NEXT: v_sub_f32_e32 v1, s2, v1
; GFX10-NEXT: v_sub_f32_e64 v3, s2, |v3|
; GFX10-NEXT: v_med3_f32 v1, v1, |v2|, v3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2

  %a.fneg = fsub float -0.0, %a
  %b.fabs = call float @llvm.fabs.f32(float %b)
  %c.fabs = call float @llvm.fabs.f32(float %c)
  %c.fabs.fneg = fsub float -0.0, %c.fabs

  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs)
  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)

  store float %med3, float addrspace(1)* %outgep
  ret void
}

; fneg(fabs(x)) on all three operands under attribute set #2. Expected: three
; v_sub_f32_e64 from 0x80000000 with |v| on the source, feeding a v_med3_f32
; that carries no source modifiers of its own.
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
; SI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s2, 0x80000000
; SI-NEXT: v_sub_f32_e64 v2, s2, |v2|
; SI-NEXT: v_sub_f32_e64 v3, s2, |v3|
; SI-NEXT: v_sub_f32_e64 v4, s2, |v4|
; SI-NEXT: v_med3_f32 v2, v2, v3, v4
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s2, 0x80000000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_f32_e64 v4, s2, |v7|
; VI-NEXT: v_sub_f32_e64 v2, s2, |v2|
; VI-NEXT: v_sub_f32_e64 v3, s2, |v3|
; VI-NEXT: v_med3_f32 v2, v4, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s2, 0x80000000
; GFX9-NEXT: v_sub_f32_e64 v1, s2, |v1|
; GFX9-NEXT: v_sub_f32_e64 v2, s2, |v2|
; GFX9-NEXT: v_sub_f32_e64 v3, s2, |v3|
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 s2, 0x80000000
; GFX10-NEXT: v_sub_f32_e64 v1, s2, |v1|
; GFX10-NEXT: v_sub_f32_e64 v2, s2, |v2|
; GFX10-NEXT: v_sub_f32_e64 v3, s2, |v3|
; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2

  %a.fabs = call float @llvm.fabs.f32(float %a)
  %a.fabs.fneg = fsub float -0.0, %a.fabs
  %b.fabs = call float @llvm.fabs.f32(float %b)
  %b.fabs.fneg = fsub float -0.0, %b.fabs
  %c.fabs = call float @llvm.fabs.f32(float %c)
  %c.fabs.fneg = fsub float -0.0, %c.fabs

  %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
  %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)

  store float %med3, float addrspace(1)* %outgep
  ret void
}

; Attribute set #1 ("no-nans-fp-math"="false"), but every operand of the idiom
; is produced by a `fadd nnan`. A v_med3_f32 is still expected on all targets,
; fed directly by the three v_add_f32 results.
define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
; SI-LABEL: v_nnan_inputs_med3_f32_pat0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-NEXT: v_add_f32_e32 v3, 2.0, v3
; SI-NEXT: v_add_f32_e32 v4, 4.0, v4
; SI-NEXT: v_med3_f32 v2, v2, v3, v4
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_nnan_inputs_med3_f32_pat0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-NEXT: v_add_f32_e32 v2, 2.0, v2
; VI-NEXT: v_add_f32_e32 v3, 4.0, v3
; VI-NEXT: v_med3_f32 v2, v4, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_nnan_inputs_med3_f32_pat0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX10-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX10-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2

  %a.nnan = fadd nnan float %a, 1.0
  %b.nnan = fadd nnan float %b, 2.0
  %c.nnan = fadd nnan float %c, 4.0

  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}


; ---------------------------------------------------------------------
; Negative patterns
; ---------------------------------------------------------------------

; The inner minnum(%a, %b) has a second use: it is also stored (volatile) to an
; undef pointer. No v_med3_f32 is expected; the min/max chain is kept and the
; extra store is emitted between its instructions.
; NOTE(review): this kernel uses attribute set #1 and no nnan flags, the same
; setup under which v_test_no_global_nnans_med3_f32_pat0_srcmod0 already shows
; no v_med3_f32, so the extra use is not the only difference from the positive
; tests - confirm that is intended.
define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[4:5]
; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: v_min_f32_e32 v5, v2, v3
; SI-NEXT: v_max_f32_e32 v2, v2, v3
; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_min_f32_e32 v2, v2, v3
; SI-NEXT: v_max_f32_e32 v2, v5, v2
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7
; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2
; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; VI-NEXT: v_min_f32_e32 v5, v4, v2
; VI-NEXT: v_max_f32_e32 v2, v4, v2
; VI-NEXT: v_min_f32_e32 v2, v2, v3
; VI-NEXT: v_max_f32_e32 v2, v5, v2
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
; GFX9-NEXT: v_min_f32_e32 v4, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v[0:1], v4, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
; GFX10-NEXT: v_max_f32_e32 v4, v1, v2
; GFX10-NEXT: v_min_f32_e32 v1, v1, v2
; GFX10-NEXT: v_min_f32_e32 v2, v4, v3
; GFX10-NEXT: v_max_f32_e32 v2, v1, v2
; GFX10-NEXT: global_store_dword v[0:1], v1, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2
  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
  store volatile float %tmp0, float addrspace(1)* undef
  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}

; Intrinsic declarations. Only the workitem id and the f32 fabs/minnum/maxnum
; intrinsics are called by the kernels in this file; the f64 and f16 ones are
; declared but not referenced here.
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.minnum.f32(float, float) #0
declare float @llvm.maxnum.f32(float, float) #0
declare double @llvm.minnum.f64(double, double) #0
declare double @llvm.maxnum.f64(double, double) #0
declare half @llvm.fabs.f16(half) #0
declare half @llvm.minnum.f16(half, half) #0
declare half @llvm.maxnum.f16(half, half) #0

; #0 is used on the intrinsic declarations, #1 on kernels compiled with
; "no-nans-fp-math"="false", and #2 on kernels compiled with it set to "true".
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }