; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=CM %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s

; 32-bit multiply whose operands are first reduced to their low 24 bits
; (shl 8 followed by lshr 8 on an i32). The expected output for both the
; cayman and the redwood run masks each operand with 16777215 (AND_INT) and
; multiplies with MULLO_INT; on cayman the MULLO_INT is listed for all four
; channels with Y/Z/W marked MASKED.
define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
; CM-LABEL: test_umul24_i32:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.y,
; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 16777215(2.350989e-38)
; CM-NEXT: MULLO_INT T1.X, T0.W, T0.Z,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.Z,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.Z,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.Z,
;
; EG-LABEL: test_umul24_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
; EG-NEXT: AND_INT * T1.W, KC0[2].Z, literal.x,
; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  ; Clear the top 8 bits of %a, leaving a zero-extended 24-bit value.
  %0 = shl i32 %a, 8
  %a_24 = lshr i32 %0, 8
  ; Same for %b.
  %1 = shl i32 %b, 8
  %b_24 = lshr i32 %1, 8
  %2 = mul i32 %a_24, %b_24
  store i32 %2, i32 addrspace(1)* %out
  ret void
}

; The result must be sign-extended.
; i16 multiply whose product is sign-extended to i32 before being stored.
; The expected output for both runs is a MULLO_INT followed by a BFE_INT
; whose width literal is 16 (presumably the signed bit-field extract that
; implements the sext - confirm against the R600 ISA documentation).
define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
; CM-LABEL: test_umul24_i16_sext:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @10, KC0[], KC1[]
; CM-NEXT: TEX 1 @6
; CM-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 42, #3
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 11:
; CM-NEXT: MULLO_INT T0.X, T1.X, T0.X,
; CM-NEXT: MULLO_INT T0.Y (MASKED), T1.X, T0.X,
; CM-NEXT: MULLO_INT T0.Z (MASKED), T1.X, T0.X,
; CM-NEXT: MULLO_INT * T0.W (MASKED), T1.X, T0.X,
; CM-NEXT: BFE_INT * T0.X, PV.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; EG-LABEL: test_umul24_i16_sext:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 42, #3
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X,
; EG-NEXT: BFE_INT T0.X, PS, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
entry:
  %mul = mul i16 %a, %b
  ; Widen the 16-bit product to the 32-bit value that is stored.
  %ext = sext i16 %mul to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; The result must be sign-extended.
; i8 multiply whose product is sign-extended to i32 before being stored.
; The expected output for both runs is a MULLO_INT followed by a BFE_INT
; whose width literal is 8.
define amdgpu_kernel void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) {
; CM-LABEL: test_umul24_i8:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @10, KC0[], KC1[]
; CM-NEXT: TEX 1 @6
; CM-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 41, #3
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 11:
; CM-NEXT: MULLO_INT T0.X, T1.X, T0.X,
; CM-NEXT: MULLO_INT T0.Y (MASKED), T1.X, T0.X,
; CM-NEXT: MULLO_INT T0.Z (MASKED), T1.X, T0.X,
; CM-NEXT: MULLO_INT * T0.W (MASKED), T1.X, T0.X,
; CM-NEXT: BFE_INT * T0.X, PV.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; EG-LABEL: test_umul24_i8:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 41, #3
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X,
; EG-NEXT: BFE_INT T0.X, PS, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
entry:
  %mul = mul i8 %a, %b
  ; Widen the 8-bit product to the 32-bit value that is stored.
  %ext = sext i8 %mul to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; High 32 bits of a 64-bit product of two i32 arguments that were masked to
; 24 bits and zero-extended. The expected output for both runs is a single
; MULHI_UINT24 reading the kernel arguments directly, with no separate
; AND_INT for the masks.
define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
; CM-LABEL: test_umulhi24_i32_i64:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MULHI_UINT24 T1.X, KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_UINT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_UINT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_UINT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
;
; EG-LABEL: test_umulhi24_i32_i64:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MULHI_UINT24 * T1.X, KC0[2].Z, KC0[2].W,
entry:
  ; 16777215 = 2^24 - 1: keep only the low 24 bits of each operand.
  %a.24 = and i32 %a, 16777215
  %b.24 = and i32 %b, 16777215
  %a.24.i64 = zext i32 %a.24 to i64
  %b.24.i64 = zext i32 %b.24 to i64
  %mul48 = mul i64 %a.24.i64, %b.24.i64
  ; Take bits 63:32 of the product as the i32 result.
  %mul48.hi = lshr i64 %mul48, 32
  %mul24hi = trunc i64 %mul48.hi to i32
  store i32 %mul24hi, i32 addrspace(1)* %out
  ret void
}

; Same high-half computation as above, but the arguments are i64 values
; masked to 24 bits. The expected output for both runs is a single
; MULHI_UINT24 on KC0[2].W and KC0[3].Y.
define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
; CM-LABEL: test_umulhi24:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MULHI_UINT24 T1.X, KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_UINT24 T1.Y (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_UINT24 T1.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_UINT24 * T1.W (MASKED), KC0[2].W, KC0[3].Y,
;
; EG-LABEL: test_umulhi24:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MULHI_UINT24 * T1.X, KC0[2].W, KC0[3].Y,
entry:
  ; 16777215 = 2^24 - 1: keep only the low 24 bits of each operand.
  %a.24 = and i64 %a, 16777215
  %b.24 = and i64 %b, 16777215
  %mul48 = mul i64 %a.24, %b.24
  ; Take bits 63:32 of the product as the i32 result.
  %mul48.hi = lshr i64 %mul48, 32
  %mul24.hi = trunc i64 %mul48.hi to i32
  store i32 %mul24.hi, i32 addrspace(1)* %out
  ret void
}

; Multiply with 24-bit inputs and 64-bit output.
; The operands are reduced to their low 24 bits with shl 40 / lshr 40 on
; i64. The expected output writes T1.X with MULLO_INT (on operands masked
; with 16777215) and T1.Y with MULHI_UINT24, and the store covers both.
define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; CM-LABEL: test_umul24_i64:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: AND_INT * T0.Z, KC0[3].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16777215(2.350989e-38)
; CM-NEXT: AND_INT * T0.W, KC0[2].W, literal.x,
; CM-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
; CM-NEXT: MULLO_INT T1.X, T0.W, T0.Z,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.Z,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.Z,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.Z,
; CM-NEXT: MULHI_UINT24 T1.X (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_UINT24 T1.Y, KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_UINT24 T1.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_UINT24 * T1.W (MASKED), KC0[2].W, KC0[3].Y,
;
; EG-LABEL: test_umul24_i64:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: AND_INT T0.W, KC0[3].Y, literal.x,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MULHI_UINT24 * T1.Y, KC0[2].W, KC0[3].Y,
entry:
  ; Clear the top 40 bits of %a, leaving a zero-extended 24-bit value.
  %tmp0 = shl i64 %a, 40
  %a_24 = lshr i64 %tmp0, 40
  ; Same for %b.
  %tmp1 = shl i64 %b, 40
  %b_24 = lshr i64 %tmp1, 40
  %tmp2 = mul i64 %a_24, %b_24
  store i64 %tmp2, i64 addrspace(1)* %out
  ret void
}