; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s

; FIXME: The tahiti invocation (GFX6 prefix) is disabled by spelling its
; directive XUN; no GFX6 assertions are present below.
; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s

; Each function below loads a <4 x i32> through an addrspace(3) pointer with a
; different alignment on the load; the assertions record the DS instructions
; emitted for gfx900, hawaii and gfx1010. The assertion lines are autogenerated
; (see the NOTE on the first line).

; No explicit alignment on the load: a single ds_read_b128 is expected on all
; three targets.
define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b128 v[0:3], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_b128 v[0:3], v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_b128 v[0:3], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr
  ret <4 x i32> %load
}

; align 1: the load is expected to be split into sixteen ds_read_u8 byte loads
; that are recombined with shift/and/or sequences.
define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_u8 v1, v0
; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX9-NEXT:    ds_read_u8 v4, v0 offset:2
; GFX9-NEXT:    ds_read_u8 v5, v0 offset:3
; GFX9-NEXT:    ds_read_u8 v6, v0 offset:4
; GFX9-NEXT:    ds_read_u8 v7, v0 offset:5
; GFX9-NEXT:    ds_read_u8 v8, v0 offset:6
; GFX9-NEXT:    ds_read_u8 v9, v0 offset:7
; GFX9-NEXT:    s_mov_b32 s5, 8
; GFX9-NEXT:    s_movk_i32 s4, 0xff
; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(5)
; GFX9-NEXT:    v_and_b32_e32 v2, s4, v4
; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
; GFX9-NEXT:    v_and_b32_e32 v4, s4, v5
; GFX9-NEXT:    v_mov_b32_e32 v3, 0xff
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
; GFX9-NEXT:    v_or3_b32 v4, v1, v2, v4
; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
; GFX9-NEXT:    v_and_b32_e32 v2, v8, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v5, v9, v3
; GFX9-NEXT:    v_and_or_b32 v1, v6, s4, v1
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
; GFX9-NEXT:    v_or3_b32 v1, v1, v2, v5
; GFX9-NEXT:    ds_read_u8 v2, v0 offset:8
; GFX9-NEXT:    ds_read_u8 v6, v0 offset:9
; GFX9-NEXT:    ds_read_u8 v7, v0 offset:10
; GFX9-NEXT:    ds_read_u8 v8, v0 offset:11
; GFX9-NEXT:    ds_read_u8 v9, v0 offset:12
; GFX9-NEXT:    ds_read_u8 v10, v0 offset:13
; GFX9-NEXT:    ds_read_u8 v11, v0 offset:14
; GFX9-NEXT:    ds_read_u8 v0, v0 offset:15
; GFX9-NEXT:    v_mov_b32_e32 v5, 8
; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
; GFX9-NEXT:    v_lshlrev_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT:    v_and_or_b32 v2, v2, v3, v6
; GFX9-NEXT:    s_waitcnt lgkmcnt(5)
; GFX9-NEXT:    v_and_b32_e32 v6, v7, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
; GFX9-NEXT:    v_and_b32_e32 v7, v8, v3
; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
; GFX9-NEXT:    v_or3_b32 v2, v2, v6, v7
; GFX9-NEXT:    v_and_b32_e32 v6, v11, v3
; GFX9-NEXT:    v_and_or_b32 v5, v9, v3, v5
; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; GFX9-NEXT:    v_or3_b32 v3, v5, v6, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_movk_i32 s4, 0xff
; GFX7-NEXT:    ds_read_u8 v1, v0
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX7-NEXT:    ds_read_u8 v4, v0 offset:2
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:3
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:4
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:6
; GFX7-NEXT:    ds_read_u8 v9, v0 offset:7
; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
; GFX7-NEXT:    v_and_b32_e32 v2, s4, v4
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_and_b32_e32 v2, s4, v5
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    v_mov_b32_e32 v3, 0xff
; GFX7-NEXT:    v_or_b32_e32 v4, v1, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_and_b32_e32 v2, v7, v3
; GFX7-NEXT:    v_and_b32_e32 v1, s4, v6
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_and_b32_e32 v2, v8, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v2, v9, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:8
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:9
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:10
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:11
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:12
; GFX7-NEXT:    ds_read_u8 v9, v0 offset:13
; GFX7-NEXT:    ds_read_u8 v10, v0 offset:14
; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
; GFX7-NEXT:    v_and_b32_e32 v5, v5, v3
; GFX7-NEXT:    v_and_b32_e32 v2, v2, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
; GFX7-NEXT:    v_and_b32_e32 v5, v6, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_and_b32_e32 v5, v7, v3
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_and_b32_e32 v6, v9, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
; GFX7-NEXT:    v_and_b32_e32 v5, v8, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_and_b32_e32 v6, v10, v3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, v0, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; GFX7-NEXT:    v_or_b32_e32 v3, v5, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, v4
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_u8 v1, v0 offset:1
; GFX10-NEXT:    ds_read_u8 v2, v0 offset:2
; GFX10-NEXT:    ds_read_u8 v3, v0 offset:3
; GFX10-NEXT:    ds_read_u8 v4, v0 offset:5
; GFX10-NEXT:    ds_read_u8 v5, v0 offset:6
; GFX10-NEXT:    ds_read_u8 v6, v0 offset:7
; GFX10-NEXT:    ds_read_u8 v7, v0 offset:9
; GFX10-NEXT:    ds_read_u8 v8, v0
; GFX10-NEXT:    ds_read_u8 v9, v0 offset:4
; GFX10-NEXT:    ds_read_u8 v10, v0 offset:8
; GFX10-NEXT:    ds_read_u8 v12, v0 offset:10
; GFX10-NEXT:    ds_read_u8 v13, v0 offset:11
; GFX10-NEXT:    ds_read_u8 v14, v0 offset:12
; GFX10-NEXT:    ds_read_u8 v15, v0 offset:13
; GFX10-NEXT:    ds_read_u8 v16, v0 offset:14
; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
; GFX10-NEXT:    v_mov_b32_e32 v17, 8
; GFX10-NEXT:    s_mov_b32 s5, 8
; GFX10-NEXT:    v_mov_b32_e32 v11, 0xff
; GFX10-NEXT:    s_movk_i32 s4, 0xff
; GFX10-NEXT:    s_waitcnt lgkmcnt(15)
; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
; GFX10-NEXT:    s_waitcnt lgkmcnt(13)
; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT:    s_waitcnt lgkmcnt(11)
; GFX10-NEXT:    v_and_b32_e32 v5, v5, v11
; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
; GFX10-NEXT:    v_and_b32_e32 v6, v6, v11
; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
; GFX10-NEXT:    v_lshlrev_b32_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
; GFX10-NEXT:    v_and_or_b32 v1, v8, s4, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
; GFX10-NEXT:    v_and_b32_e32 v8, v12, v11
; GFX10-NEXT:    v_and_or_b32 v4, v9, s4, v4
; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
; GFX10-NEXT:    v_and_b32_e32 v9, v13, v11
; GFX10-NEXT:    v_and_or_b32 v7, v10, v11, v7
; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
; GFX10-NEXT:    v_lshlrev_b32_sdwa v10, v17, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
; GFX10-NEXT:    v_and_b32_e32 v12, v16, v11
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v0, v0, v11
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
; GFX10-NEXT:    v_and_or_b32 v10, v14, v11, v10
; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 24, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v3
; GFX10-NEXT:    v_or3_b32 v3, v10, v11, v12
; GFX10-NEXT:    v_or3_b32 v1, v4, v5, v6
; GFX10-NEXT:    v_or3_b32 v2, v7, v8, v9
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
  ret <4 x i32> %load
}

; align 2: the load is expected to be split into eight ds_read_u16 loads that
; are paired back into four 32-bit results.
define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s4, 0xffff
; GFX9-NEXT:    ds_read_u16 v1, v0
; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
; GFX9-NEXT:    ds_read_u16 v7, v0 offset:12
; GFX9-NEXT:    ds_read_u16 v8, v0 offset:14
; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
; GFX9-NEXT:    v_and_b32_e32 v0, s4, v2
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT:    v_and_or_b32 v0, v1, s4, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
; GFX9-NEXT:    v_and_b32_e32 v1, s4, v4
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
; GFX9-NEXT:    v_and_b32_e32 v2, s4, v6
; GFX9-NEXT:    v_and_or_b32 v1, v3, s4, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v3, s4, v8
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT:    v_and_or_b32 v2, v5, s4, v2
; GFX9-NEXT:    v_and_or_b32 v3, v7, s4, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_u16 v1, v0
; GFX7-NEXT:    ds_read_u16 v2, v0 offset:2
; GFX7-NEXT:    ds_read_u16 v3, v0 offset:4
; GFX7-NEXT:    ds_read_u16 v4, v0 offset:6
; GFX7-NEXT:    ds_read_u16 v5, v0 offset:8
; GFX7-NEXT:    ds_read_u16 v6, v0 offset:10
; GFX7-NEXT:    ds_read_u16 v7, v0 offset:12
; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
; GFX7-NEXT:    s_mov_b32 s4, 0xffff
; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
; GFX7-NEXT:    v_and_b32_e32 v0, s4, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_and_b32_e32 v2, s4, v4
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX7-NEXT:    v_and_b32_e32 v1, s4, v3
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_and_b32_e32 v3, s4, v6
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v8
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    v_and_b32_e32 v2, s4, v5
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX7-NEXT:    v_and_b32_e32 v3, s4, v7
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_u16 v1, v0 offset:2
; GFX10-NEXT:    ds_read_u16 v2, v0 offset:6
; GFX10-NEXT:    ds_read_u16 v3, v0 offset:10
; GFX10-NEXT:    ds_read_u16 v4, v0 offset:14
; GFX10-NEXT:    ds_read_u16 v5, v0
; GFX10-NEXT:    ds_read_u16 v6, v0 offset:4
; GFX10-NEXT:    ds_read_u16 v7, v0 offset:8
; GFX10-NEXT:    ds_read_u16 v8, v0 offset:12
; GFX10-NEXT:    s_mov_b32 s4, 0xffff
; GFX10-NEXT:    s_waitcnt lgkmcnt(7)
; GFX10-NEXT:    v_and_b32_e32 v0, s4, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
; GFX10-NEXT:    v_and_b32_e32 v1, s4, v2
; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
; GFX10-NEXT:    v_and_b32_e32 v2, s4, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
; GFX10-NEXT:    v_and_b32_e32 v3, s4, v4
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(3)
; GFX10-NEXT:    v_and_or_b32 v0, v5, s4, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
; GFX10-NEXT:    v_and_or_b32 v1, v6, s4, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
; GFX10-NEXT:    v_and_or_b32 v2, v7, s4, v2
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_and_or_b32 v3, v8, s4, v3
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2
  ret <4 x i32> %load
}

; align 4: two ds_read2_b32 instructions are expected on all three targets.
define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align4:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX7-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
  ret <4 x i32> %load
}

; align 8: a single ds_read2_b64 is expected on all three targets.
define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
  ret <4 x i32> %load
}

; align 16: a single ds_read_b128 is expected on all three targets.
define <4 x i32> @load_lds_v4i32_align16(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b128 v[0:3], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_b128 v[0:3], v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_b128 v[0:3], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16
  ret <4 x i32> %load
}