; RUN: opt -arm-parallel-dsp -dce -mtriple=armv7-a -S %s -o - | FileCheck %s

; Tests for forming smlad/smlald from pairs of sign-extended i16 multiplies,
; with the multiply-accumulate chain laid out in one or several basic blocks.

; Both multiplies, their sum and the accumulation with %acc live in the entry
; block. The checks expect each pair of adjacent i16 loads to become a single
; i32 load through a bitcast pointer, feeding smlad with %acc as accumulator.
; CHECK-LABEL: single_block
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc)
define i32 @single_block(i16* %a, i16* %b, i32 %acc) {
entry:
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.1 = load i16, i16* %addr.a.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %add = add i32 %mul.0, %mul.1
  %res = add i32 %add, %acc
  ret i32 %res
}

; Same shape as single_block, but the i32 products are sign-extended to i64
; before being summed with an i64 %acc. The checks expect smlald.
; CHECK-LABEL: single_block_64
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i64 @llvm.arm.smlald(i32 [[A]], i32 [[B]], i64 %acc)
define i64 @single_block_64(i16* %a, i16* %b, i64 %acc) {
entry:
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.1 = load i16, i16* %addr.a.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %sext.mul.0 = sext i32 %mul.0 to i64
  %sext.mul.1 = sext i32 %mul.1 to i64
  %add = add i64 %sext.mul.0, %sext.mul.1
  %res = add i64 %add, %acc
  ret i64 %res
}

; The two multiplies and their sum are in the entry block, while the add of
; %acc sits in bb.1. The checks expect smlad with a zero accumulator rather
; than %acc.
; CHECK-LABEL: multi_block
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0)
define i32 @multi_block(i16* %a, i16* %b, i32 %acc) {
entry:
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.1 = load i16, i16* %addr.a.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %add = add i32 %mul.0, %mul.1
  br label %bb.1

bb.1:
  %res = add i32 %add, %acc
  ret i32 %res
}

; 64-bit variant of multi_block: the i64 add of %acc is in bb.1 and the checks
; expect smlald with a zero accumulator.
; CHECK-LABEL: multi_block_64
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i64 @llvm.arm.smlald(i32 [[A]], i32 [[B]], i64 0)
define i64 @multi_block_64(i16* %a, i16* %b, i64 %acc) {
entry:
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.1 = load i16, i16* %addr.a.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %sext.mul.0 = sext i32 %mul.0 to i64
  %sext.mul.1 = sext i32 %mul.1 to i64
  %add = add i64 %sext.mul.0, %sext.mul.1
  br label %bb.1

bb.1:
  %res = add i64 %add, %acc
  ret i64 %res
}

; Here the two multiplies are in different blocks (mul.0 in entry, mul.1 in
; bb.1). The checks expect that no smlad is generated.
; CHECK-LABEL: multi_block_1
; CHECK-NOT: call i32 @llvm.arm.smlad
define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) {
entry:
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  br label %bb.1

bb.1:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.1 = load i16, i16* %addr.a.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %add = add i32 %mul.0, %mul.1
  %res = add i32 %add, %acc
  ret i32 %res
}

; Sixteen i16 loads in one block (elements 0-7 of both %a and %b). The checks
; expect exactly three smlad calls.
; NOTE(review): mul.2 multiplies %sext.a.0 by %sext.b.0 and mul.3 uses
; %sext.a.1, so %ld.a.2, %ld.b.2 and %ld.a.3 have no users. This looks like it
; may be a copy-paste slip, but the expectations below may depend on it -
; confirm before changing.
; TODO: Four smlads should be generated here, but mul.0 and mul.3 remain as
; scalars.
; CHECK-LABEL: num_load_limit
; CHECK: call i32 @llvm.arm.smlad
; CHECK: call i32 @llvm.arm.smlad
; CHECK: call i32 @llvm.arm.smlad
; CHECK-NOT: call i32 @llvm.arm.smlad
define i32 @num_load_limit(i16* %a, i16* %b, i32 %acc) {
entry:
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.1 = load i16, i16* %addr.a.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %add.0 = add i32 %mul.0, %mul.1

  %addr.a.2 = getelementptr i16, i16* %a, i32 2
  %addr.b.2 = getelementptr i16, i16* %b, i32 2
  %ld.a.2 = load i16, i16* %addr.a.2
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %ld.b.2 = load i16, i16* %addr.b.2
  %sext.b.2 = sext i16 %ld.b.2 to i32
  %mul.2 = mul i32 %sext.a.0, %sext.b.0
  %addr.a.3 = getelementptr i16, i16* %a, i32 3
  %addr.b.3 = getelementptr i16, i16* %b, i32 3
  %ld.a.3 = load i16, i16* %addr.a.3
  %sext.a.3 = sext i16 %ld.a.3 to i32
  %ld.b.3 = load i16, i16* %addr.b.3
  %sext.b.3 = sext i16 %ld.b.3 to i32
  %mul.3 = mul i32 %sext.a.1, %sext.b.3
  %add.3 = add i32 %mul.2, %mul.3

  %addr.a.4 = getelementptr i16, i16* %a, i32 4
  %addr.b.4 = getelementptr i16, i16* %b, i32 4
  %ld.a.4 = load i16, i16* %addr.a.4
  %sext.a.4 = sext i16 %ld.a.4 to i32
  %ld.b.4 = load i16, i16* %addr.b.4
  %sext.b.4 = sext i16 %ld.b.4 to i32
  %mul.4 = mul i32 %sext.a.4, %sext.b.4
  %addr.a.5 = getelementptr i16, i16* %a, i32 5
  %addr.b.5 = getelementptr i16, i16* %b, i32 5
  %ld.a.5 = load i16, i16* %addr.a.5
  %sext.a.5 = sext i16 %ld.a.5 to i32
  %ld.b.5 = load i16, i16* %addr.b.5
  %sext.b.5 = sext i16 %ld.b.5 to i32
  %mul.5 = mul i32 %sext.a.5, %sext.b.5
  %add.5 = add i32 %mul.4, %mul.5

  %addr.a.6 = getelementptr i16, i16* %a, i32 6
  %addr.b.6 = getelementptr i16, i16* %b, i32 6
  %ld.a.6 = load i16, i16* %addr.a.6
  %sext.a.6 = sext i16 %ld.a.6 to i32
  %ld.b.6 = load i16, i16* %addr.b.6
  %sext.b.6 = sext i16 %ld.b.6 to i32
  %mul.6 = mul i32 %sext.a.6, %sext.b.6
  %addr.a.7 = getelementptr i16, i16* %a, i32 7
  %addr.b.7 = getelementptr i16, i16* %b, i32 7
  %ld.a.7 = load i16, i16* %addr.a.7
  %sext.a.7 = sext i16 %ld.a.7 to i32
  %ld.b.7 = load i16, i16* %addr.b.7
  %sext.b.7 = sext i16 %ld.b.7 to i32
  %mul.7 = mul i32 %sext.a.7, %sext.b.7
  %add.7 = add i32 %mul.6, %mul.7

  %add.10 = add i32 %add.7, %add.5
  %add.11 = add i32 %add.3, %add.0
  %add.12 = add i32 %add.10, %add.11
  %res = add i32 %add.12, %acc
  ret i32 %res
}

; Same body as num_load_limit plus one more load/multiply pair (mul.8), for
; eighteen i16 loads in total. The checks expect that no smlad is generated;
; presumably this exceeds the pass's load-count limit - confirm against the
; ARMParallelDSP source.
; NOTE(review): %addr.a.8 and %addr.b.8 use index 7, the same element as
; %addr.a.7 and %addr.b.7. Possibly intended to be 8 - confirm before changing.
; CHECK-LABEL: too_many_loads
; CHECK-NOT: call i32 @llvm.arm.smlad
define i32 @too_many_loads(i16* %a, i16* %b, i32 %acc) {
entry:
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.1 = load i16, i16* %addr.a.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %add.0 = add i32 %mul.0, %mul.1

  %addr.a.2 = getelementptr i16, i16* %a, i32 2
  %addr.b.2 = getelementptr i16, i16* %b, i32 2
  %ld.a.2 = load i16, i16* %addr.a.2
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %ld.b.2 = load i16, i16* %addr.b.2
  %sext.b.2 = sext i16 %ld.b.2 to i32
  %mul.2 = mul i32 %sext.a.0, %sext.b.0
  %addr.a.3 = getelementptr i16, i16* %a, i32 3
  %addr.b.3 = getelementptr i16, i16* %b, i32 3
  %ld.a.3 = load i16, i16* %addr.a.3
  %sext.a.3 = sext i16 %ld.a.3 to i32
  %ld.b.3 = load i16, i16* %addr.b.3
  %sext.b.3 = sext i16 %ld.b.3 to i32
  %mul.3 = mul i32 %sext.a.1, %sext.b.3
  %add.3 = add i32 %mul.2, %mul.3

  %addr.a.4 = getelementptr i16, i16* %a, i32 4
  %addr.b.4 = getelementptr i16, i16* %b, i32 4
  %ld.a.4 = load i16, i16* %addr.a.4
  %sext.a.4 = sext i16 %ld.a.4 to i32
  %ld.b.4 = load i16, i16* %addr.b.4
  %sext.b.4 = sext i16 %ld.b.4 to i32
  %mul.4 = mul i32 %sext.a.4, %sext.b.4
  %addr.a.5 = getelementptr i16, i16* %a, i32 5
  %addr.b.5 = getelementptr i16, i16* %b, i32 5
  %ld.a.5 = load i16, i16* %addr.a.5
  %sext.a.5 = sext i16 %ld.a.5 to i32
  %ld.b.5 = load i16, i16* %addr.b.5
  %sext.b.5 = sext i16 %ld.b.5 to i32
  %mul.5 = mul i32 %sext.a.5, %sext.b.5
  %add.5 = add i32 %mul.4, %mul.5

  %addr.a.6 = getelementptr i16, i16* %a, i32 6
  %addr.b.6 = getelementptr i16, i16* %b, i32 6
  %ld.a.6 = load i16, i16* %addr.a.6
  %sext.a.6 = sext i16 %ld.a.6 to i32
  %ld.b.6 = load i16, i16* %addr.b.6
  %sext.b.6 = sext i16 %ld.b.6 to i32
  %mul.6 = mul i32 %sext.a.6, %sext.b.6
  %addr.a.7 = getelementptr i16, i16* %a, i32 7
  %addr.b.7 = getelementptr i16, i16* %b, i32 7
  %ld.a.7 = load i16, i16* %addr.a.7
  %sext.a.7 = sext i16 %ld.a.7 to i32
  %ld.b.7 = load i16, i16* %addr.b.7
  %sext.b.7 = sext i16 %ld.b.7 to i32
  %mul.7 = mul i32 %sext.a.7, %sext.b.7
  %add.7 = add i32 %mul.6, %mul.7

  %addr.a.8 = getelementptr i16, i16* %a, i32 7
  %addr.b.8 = getelementptr i16, i16* %b, i32 7
  %ld.a.8 = load i16, i16* %addr.a.8
  %sext.a.8 = sext i16 %ld.a.8 to i32
  %ld.b.8 = load i16, i16* %addr.b.8
  %sext.b.8 = sext i16 %ld.b.8 to i32
  %mul.8 = mul i32 %sext.a.8, %sext.b.8

  %add.10 = add i32 %add.7, %add.5
  %add.11 = add i32 %add.3, %add.0
  %add.12 = add i32 %add.10, %add.11
  %add.13 = add i32 %add.12, %acc
  %res = add i32 %add.13, %mul.8
  ret i32 %res
}