//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64LegalizerInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/MathExtras.h"
#include <initializer_list>

#define DEBUG_TYPE "aarch64-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;

AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
    : ST(&ST) {
  using namespace TargetOpcode;
  const LLT p0 = LLT::pointer(0, 64);
  const LLT s1 = LLT::scalar(1);
  const LLT s8 = LLT::scalar(8);
  const LLT s16 = LLT::scalar(16);
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);
  const LLT s128 = LLT::scalar(128);
  const LLT s256 = LLT::scalar(256);
  const LLT v16s8 = LLT::fixed_vector(16, 8);
  const LLT v8s8 = LLT::fixed_vector(8, 8);
  const LLT v4s8 = LLT::fixed_vector(4, 8);
  const LLT v8s16 = LLT::fixed_vector(8, 16);
  const LLT v4s16 = LLT::fixed_vector(4, 16);
  const LLT v2s16 = LLT::fixed_vector(2, 16);
  const LLT v2s32 = LLT::fixed_vector(2, 32);
  const LLT v4s32 = LLT::fixed_vector(4, 32);
  const LLT v2s64 = LLT::fixed_vector(2, 64);
  const LLT v2p0 = LLT::fixed_vector(2, p0);

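  // The vector types that exactly fill a 64-bit or 128-bit NEON register.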
  std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
                                                        v16s8, v8s16, v4s32,
                                                        v2s64, v2p0,
                                                        /* End 128bit types */
                                                        /* Begin 64bit types */
                                                        v8s8, v4s16, v2s32};

  const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();

  // FIXME: support subtargets which have neon/fp-armv8 disabled.
  if (!ST.hasNEON() || !ST.hasFPARMv8()) {
    getLegacyLegalizerInfo().computeTables();
    return;
  }

  // Some instructions only support s16 if the subtarget has full 16-bit FP
  // support.
  const bool HasFP16 = ST.hasFullFP16();
  const LLT &MinFPScalar = HasFP16 ? s16 : s32;

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
      .legalFor({p0, s1, s8, s16, s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .clampScalar(0, s8, s64)
      .widenScalarToNextPow2(0, 8)
      .fewerElementsIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].isVector() &&
                   (Query.Types[0].getElementType() != s64 ||
                    Query.Types[0].getNumElements() != 2);
          },
          [=](const LegalityQuery &Query) {
            LLT EltTy = Query.Types[0].getElementType();
            if (EltTy == s64)
              return std::make_pair(0, LLT::fixed_vector(2, 64));
            return std::make_pair(0, EltTy);
          });

  getActionDefinitionsBuilder(G_PHI).legalFor({p0, s16, s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .clampScalar(0, s16, s64)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({s32, s64, v4s32, v2s32, v2s64})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0);

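  // Basic integer arithmetic and logic. NEON has no 2x64-bit integer
  // multiply, so a G_MUL on v2s64 is scalarized into two s64 multiplies; the
  // remaining operations stay legal on v2s64.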
  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
      .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
      .scalarizeIf(
          [=](const LegalityQuery &Query) {
            return Query.Opcode == G_MUL && Query.Types[0] == v2s64;
          },
          0)
      .legalFor({v2s64})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

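  // Shifts. A 32-bit scalar shift whose amount is also 32 bits is handled
  // custom (see legalizeShlAshrLshr below) so that constant shift amounts can
  // be promoted to s64 for the imported immediate-form patterns.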
  getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
      .customIf([=](const LegalityQuery &Query) {
        const auto &SrcTy = Query.Types[0];
        const auto &AmtTy = Query.Types[1];
        return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
               AmtTy.getSizeInBits() == 32;
      })
      .legalFor({
          {s32, s32},
          {s32, s64},
          {s64, s64},
          {v8s8, v8s8},
          {v16s8, v16s8},
          {v4s16, v4s16},
          {v8s16, v8s16},
          {v2s32, v2s32},
          {v4s32, v4s32},
          {v2s64, v2s64},
      })
      .widenScalarToNextPow2(0)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0)
      .minScalarSameAs(1, 0);

  getActionDefinitionsBuilder(G_PTR_ADD)
      .legalFor({{p0, s64}, {v2p0, v2s64}})
      .clampScalar(1, s64, s64);

  getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});

  getActionDefinitionsBuilder({G_SDIV, G_UDIV})
      .legalFor({s32, s64})
      .libcallFor({s128})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0)
      .scalarize(0);

  getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .lowerFor({s1, s8, s16, s32, s64});

  getActionDefinitionsBuilder({G_SMULO, G_UMULO}).lowerFor({{s64, s1}});

  getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64});

  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      // FIXME: This shouldn't be needed as v2s64 types are going to
      // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
      .clampNumElements(0, v2s64, v2s64)
      .lower();

  getActionDefinitionsBuilder(
      {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
      .legalFor({{s32, s1}, {s64, s1}})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
      .legalFor({s32, s64, v2s64, v4s32, v2s32})
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64);

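  // There is no FP remainder instruction, so G_FREM is always a libcall
  // (fmodf/fmod).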
  getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64});

  getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT,
                               G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND,
                               G_FNEARBYINT, G_INTRINSIC_LRINT})
      // If we don't have full FP16 support, then scalarize the elements of
      // vectors containing fp16 types.
      .fewerElementsIf(
          [=, &ST](const LegalityQuery &Query) {
            const auto &Ty = Query.Types[0];
            return Ty.isVector() && Ty.getElementType() == s16 &&
                   !ST.hasFullFP16();
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
      // If we don't have full FP16 support, then widen s16 to s32 if we
      // encounter it.
      .widenScalarIf(
          [=, &ST](const LegalityQuery &Query) {
            return Query.Types[0] == s16 && !ST.hasFullFP16();
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
      .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});

  getActionDefinitionsBuilder(
      {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW})
      // We need a call for these, so we always need to scalarize.
      .scalarize(0)
      // Regardless of FP16 support, widen 16-bit elements to 32-bits.
      .minScalar(0, s32)
      .libcallFor({s32, s64, v2s32, v4s32, v2s64});

  getActionDefinitionsBuilder(G_INSERT)
      .unsupportedIf([=](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() <= Query.Types[1].getSizeInBits();
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];
        const LLT &Ty1 = Query.Types[1];
        if (Ty0 != s32 && Ty0 != s64 && Ty0 != p0)
          return false;
        return isPowerOf2_32(Ty1.getSizeInBits()) &&
               (Ty1.getSizeInBits() == 1 || Ty1.getSizeInBits() >= 8);
      })
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0)
      .maxScalarIf(typeInSet(0, {s32}), 1, s16)
      .maxScalarIf(typeInSet(0, {s64}), 1, s32)
      .widenScalarToNextPow2(1);

  getActionDefinitionsBuilder(G_EXTRACT)
      .unsupportedIf([=](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() >= Query.Types[1].getSizeInBits();
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];
        const LLT &Ty1 = Query.Types[1];
        if (Ty1 != s32 && Ty1 != s64 && Ty1 != s128)
          return false;
        if (Ty1 == p0)
          return true;
        return isPowerOf2_32(Ty0.getSizeInBits()) &&
               (Ty0.getSizeInBits() == 1 || Ty0.getSizeInBits() >= 8);
      })
      .clampScalar(1, s32, s128)
      .widenScalarToNextPow2(1)
      .maxScalarIf(typeInSet(1, {s32}), 0, s16)
      .maxScalarIf(typeInSet(1, {s64}), 0, s32)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
      .lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
      .legalForTypesWithMemDesc({{s32, p0, s8, 8},
                                 {s32, p0, s16, 8},
                                 {s32, p0, s32, 8},
                                 {s64, p0, s8, 2},
                                 {s64, p0, s16, 2},
                                 {s64, p0, s32, 4},
                                 {s64, p0, s64, 8},
                                 {p0, p0, s64, 8},
                                 {v2s32, p0, s64, 8}})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0)
      // TODO: We could support sum-of-pow2's but the lowering code doesn't know
      // how to do that yet.
      .unsupportedIfMemSizeNotPow2()
      // Lower anything left over into G_*EXT and G_LOAD
      .lower();

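  // Loads and stores of vectors of pointers have no imported patterns; they
  // are custom-legalized by bitcasting the value to a vector of s64 (see
  // legalizeLoadStore below).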
  auto IsPtrVecPred = [=](const LegalityQuery &Query) {
    const LLT &ValTy = Query.Types[0];
    if (!ValTy.isVector())
      return false;
    const LLT EltTy = ValTy.getElementType();
    return EltTy.isPointer() && EltTy.getAddressSpace() == 0;
  };

  getActionDefinitionsBuilder(G_LOAD)
      .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                 {s16, p0, s16, 8},
                                 {s32, p0, s32, 8},
                                 {s64, p0, s64, 8},
                                 {p0, p0, s64, 8},
                                 {s128, p0, s128, 8},
                                 {v8s8, p0, s64, 8},
                                 {v16s8, p0, s128, 8},
                                 {v4s16, p0, s64, 8},
                                 {v8s16, p0, s128, 8},
                                 {v2s32, p0, s64, 8},
                                 {v4s32, p0, s128, 8},
                                 {v2s64, p0, s128, 8}})
      // These extends are also legal
      .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}})
      .clampScalar(0, s8, s64)
      .lowerIfMemSizeNotPow2()
      .widenScalarToNextPow2(0)
      .narrowScalarIf(
          [=](const LegalityQuery &Query) {
            // Clamp extending load results to 32-bits.
            return Query.Types[0].isScalar() &&
                   Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
                   Query.Types[0].getSizeInBits() > 32;
          },
          changeTo(0, s32))
      // Lower any any-extending loads left into G_ANYEXT and G_LOAD
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0] != Query.MMODescrs[0].MemoryTy;
      })
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeIs(0, v2s16), 0);

  getActionDefinitionsBuilder(G_STORE)
      .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                 {s16, p0, s8, 8}, // truncstorei8 from s16
                                 {s32, p0, s8, 8}, // truncstorei8 from s32
                                 {s64, p0, s8, 8}, // truncstorei8 from s64
                                 {s16, p0, s16, 8},
                                 {s32, p0, s16, 8}, // truncstorei16 from s32
                                 {s64, p0, s16, 8}, // truncstorei16 from s64
                                 {s32, p0, s8, 8},
                                 {s32, p0, s16, 8},
                                 {s32, p0, s32, 8},
                                 {s64, p0, s64, 8},
                                 {s64, p0, s32, 8}, // truncstorei32 from s64
                                 {p0, p0, s64, 8},
                                 {s128, p0, s128, 8},
                                 {v16s8, p0, s128, 8},
                                 {v8s8, p0, s64, 8},
                                 {v4s16, p0, s64, 8},
                                 {v8s16, p0, s128, 8},
                                 {v2s32, p0, s64, 8},
                                 {v4s32, p0, s128, 8},
                                 {v2s64, p0, s128, 8}})
      .clampScalar(0, s8, s64)
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0].isScalar() &&
               Query.Types[0] != Query.MMODescrs[0].MemoryTy;
      })
      // Maximum: sN * k = 128
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .lowerIfMemSizeNotPow2()
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeIs(0, v2s16), 0);

  // Constants
  getActionDefinitionsBuilder(G_CONSTANT)
      .legalFor({p0, s8, s16, s32, s64})
      .clampScalar(0, s8, s64)
      .widenScalarToNextPow2(0);
  getActionDefinitionsBuilder(G_FCONSTANT)
      .legalIf([=](const LegalityQuery &Query) {
        const auto &Ty = Query.Types[0];
        if (HasFP16 && Ty == s16)
          return true;
        return Ty == s32 || Ty == s64 || Ty == s128;
      })
      .clampScalar(0, MinFPScalar, s128);

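  // Comparisons. Scalar compares always produce an s32 result, so the scalar
  // destination is clamped to s32; vector compares produce a mask with the
  // same element count as the operands.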
  getActionDefinitionsBuilder({G_ICMP, G_FCMP})
      .legalFor({{s32, s32},
                 {s32, s64},
                 {s32, p0},
                 {v4s32, v4s32},
                 {v2s32, v2s32},
                 {v2s64, v2s64},
                 {v2s64, v2p0},
                 {v4s16, v4s16},
                 {v8s16, v8s16},
                 {v8s8, v8s8},
                 {v16s8, v16s8}})
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s32)
      .minScalarEltSameAsIf(
          [=](const LegalityQuery &Query) {
            const LLT &Ty = Query.Types[0];
            const LLT &SrcTy = Query.Types[1];
            return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
                   Ty.getElementType() != SrcTy.getElementType();
          },
          0, 1)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
          1, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
          s64)
      .widenScalarOrEltToNextPow2(1)
      .clampNumElements(0, v2s32, v4s32);

  // Extensions
  auto ExtLegalFunc = [=](const LegalityQuery &Query) {
    unsigned DstSize = Query.Types[0].getSizeInBits();

    if (DstSize == 128 && !Query.Types[0].isVector())
      return false; // Extending to a scalar s128 needs narrowing.

    // Make sure that we have something that will fit in a register, and
    // make sure it's a power of 2.
    if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize))
      return false;

    const LLT &SrcTy = Query.Types[1];

    // Special case for s1.
    if (SrcTy == s1)
      return true;

    // Make sure we fit in a register otherwise. Don't bother checking that
    // the source type is below 128 bits. We shouldn't be allowing anything
    // through which is wider than the destination in the first place.
    unsigned SrcSize = SrcTy.getSizeInBits();
    if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
      return false;

    return true;
  };
  getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
      .legalIf(ExtLegalFunc)
      .clampScalar(0, s64, s64); // Just for s128, others are handled above.

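  // Truncating a source wider than 128 bits down to v8s8 cannot be done in a
  // single step; it is custom-legalized into a cascade of narrower truncates
  // (see legalizeVectorTrunc below).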
  getActionDefinitionsBuilder(G_TRUNC)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
          0, s8)
      .customIf([=](const LegalityQuery &Query) {
        LLT DstTy = Query.Types[0];
        LLT SrcTy = Query.Types[1];
        return DstTy == v8s8 && SrcTy.getSizeInBits() > 128;
      })
      .alwaysLegal();

  getActionDefinitionsBuilder(G_SEXT_INREG).legalFor({s32, s64}).lower();

  // FP conversions
  getActionDefinitionsBuilder(G_FPTRUNC)
      .legalFor(
          {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
      .clampMaxNumElements(0, s32, 2);
  getActionDefinitionsBuilder(G_FPEXT)
      .legalFor(
          {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
      .clampMaxNumElements(0, s64, 2);

  // Conversions
  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
      .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0)
      .clampScalar(1, s32, s64)
      .widenScalarToNextPow2(1);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
      .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
      .clampScalar(1, s32, s64)
      .minScalarSameAs(1, 0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0);

  // Control-flow
  getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s8, s16, s32});
  getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});

  getActionDefinitionsBuilder(G_SELECT)
      .legalFor({{s32, s1}, {s64, s1}, {p0, s1}})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0)
      .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
      .lowerIf(isVector(0));

  // Pointer-handling
  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});

  if (TM.getCodeModel() == CodeModel::Small)
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
  else
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
      .maxScalar(0, s64)
      .widenScalarToNextPow2(0, /*Min*/ 8);

  getActionDefinitionsBuilder(G_INTTOPTR)
      .unsupportedIf([&](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
      })
      .legalFor({{p0, s64}, {v2p0, v2s64}});

  // Casts of 32 and 64-bit widths are just copies.
  // Same for 128-bit widths, except they are on the FPR bank.
  getActionDefinitionsBuilder(G_BITCAST)
      // FIXME: This is wrong since G_BITCAST is not allowed to change the
      // number of bits but it's what the previous code described and fixing
      // it breaks tests.
      .legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
                                 v8s16, v4s16, v2s16, v4s32, v2s32, v2s64,
                                 v2p0});

  getActionDefinitionsBuilder(G_VASTART).legalFor({p0});

  // va_list must be a pointer, but most sized types are pretty easy to handle
  // as the destination.
  getActionDefinitionsBuilder(G_VAARG)
      .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
      .clampScalar(0, s8, s64)
      .widenScalarToNextPow2(0, /*Min*/ 8);

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
      .lowerIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, s1),
                   typeIs(2, p0)));

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customIf([](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() == 128;
      })
      .clampScalar(0, s32, s64)
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));

  getActionDefinitionsBuilder(
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
       G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
      .clampScalar(0, s32, s64)
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    // FIXME: This rule is horrible, but specifies the same as what we had
    // before with the particularly strange definitions removed (e.g.
    // s8 = G_MERGE_VALUES s32, s32).
    // Part of the complexity comes from these ops being extremely flexible. For
    // example, you can build/decompose vectors with it, concatenate vectors,
    // etc. and in addition to this you can also bitcast with it at the same
    // time. We've been considering breaking it up into multiple ops to make it
    // more manageable throughout the backend.
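    // Typical well-formed cases this rule keeps legal:
    //   %0:_(s128) = G_MERGE_VALUES %a(s64), %b(s64)
    //   %lo:_(s64), %hi:_(s64) = G_UNMERGE_VALUES %x(s128)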
    getActionDefinitionsBuilder(Op)
        // Break up vectors with weird elements into scalars
        .fewerElementsIf(
            [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
            scalarize(0))
        .fewerElementsIf(
            [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
            scalarize(1))
        // Clamp the big scalar to s8-s128 and make it a power of 2.
        .clampScalar(BigTyIdx, s8, s128)
        .widenScalarIf(
            [=](const LegalityQuery &Query) {
              const LLT &Ty = Query.Types[BigTyIdx];
              return !isPowerOf2_32(Ty.getSizeInBits()) &&
                     Ty.getSizeInBits() % 64 != 0;
            },
            [=](const LegalityQuery &Query) {
              // Pick the next power of 2, or a multiple of 64 over 128.
              // Whichever is smaller.
              const LLT &Ty = Query.Types[BigTyIdx];
              unsigned NewSizeInBits =
                  1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
              if (NewSizeInBits >= 256) {
                unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
                if (RoundedTo < NewSizeInBits)
                  NewSizeInBits = RoundedTo;
              }
              return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
            })
        // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
        // worth considering the multiples of 64 since 2*192 and 2*384 are not
        // valid.
        .clampScalar(LitTyIdx, s8, s256)
        .widenScalarToNextPow2(LitTyIdx, /*Min*/ 8)
        // So at this point, we have s8, s16, s32, s64, s128, s192, s256, s384,
        // s512, <X x s8>, <X x s16>, <X x s32>, or <X x s64>.
        // At this point it's simple enough to accept the legal types.
        .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];
          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;
          return BigTy.getSizeInBits() % LitTy.getSizeInBits() == 0;
        })
        // Any vectors left are the wrong size. Scalarize them.
        .scalarize(0)
        .scalarize(1);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
      .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      })
      .minScalar(2, s64)
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &VecTy = Query.Types[1];
        return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
               VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
               VecTy == v16s8 || VecTy == v2p0;
      })
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            // We want to promote <M x s1> to <M x s64> if that wouldn't
            // cause the total vec size to be > 128b.
            return Query.Types[1].getNumElements() <= 2;
          },
          0, s64)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 4;
          },
          0, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 8;
          },
          0, s16)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 16;
          },
          0, s8)
      .minScalarOrElt(0, s8) // Worst case, we need at least s8.
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, p0, 2);

  getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
      .legalIf(typeInSet(0, {v8s16, v2s32, v4s32, v2s64}));

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
      .legalFor({{v8s8, s8},
                 {v16s8, s8},
                 {v2s16, s16},
                 {v4s16, s16},
                 {v8s16, s16},
                 {v2s32, s32},
                 {v4s32, s32},
                 {v2p0, p0},
                 {v2s64, s64}})
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .minScalarSameAs(1, 0);

  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();

  getActionDefinitionsBuilder(G_CTLZ)
      .legalForCartesianProduct(
          {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .scalarize(1);
  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();

  // TODO: Custom lowering for v2s32, v4s32, v2s64.
  getActionDefinitionsBuilder(G_BITREVERSE).legalFor({s32, s64, v8s8, v16s8});

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();

  // TODO: Handle vector types.
  getActionDefinitionsBuilder(G_CTTZ)
      .clampScalar(0, s32, s64)
      .scalarSameSizeAs(1, 0)
      .customFor({s32, s64});

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &DstTy = Query.Types[0];
        const LLT &SrcTy = Query.Types[1];
        // For now just support the TBL2 variant which needs the source vectors
        // to be the same size as the dest.
        if (DstTy != SrcTy)
          return false;
        for (auto &Ty : {v2s32, v4s32, v2s64, v2p0, v16s8, v8s16}) {
          if (DstTy == Ty)
            return true;
        }
        return false;
      })
      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors); we
      // just want those lowered into G_BUILD_VECTOR.
      .lowerIf([=](const LegalityQuery &Query) {
        return !Query.Types[1].isVector();
      })
      .moreElementsToNextPow2(0)
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .legalFor({{v4s32, v2s32}, {v8s16, v4s16}});

  getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}});

  getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) {
    return Query.Types[0] == p0 && Query.Types[1] == s64;
  });

  getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();

  getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
      .libcall();

  // FIXME: Legal types are only legal with NEON.
  getActionDefinitionsBuilder(G_ABS)
      .lowerIf(isScalar(0))
      .legalFor(PackedVectorAllTypeList);

  getActionDefinitionsBuilder(G_VECREDUCE_FADD)
      // We only have FADDP to do reduction-like operations. Lower the rest.
      .legalFor({{s32, v2s32}, {s64, v2s64}})
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 2)
      .lower();

  getActionDefinitionsBuilder(G_VECREDUCE_ADD)
      .legalFor(
          {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .lower();

  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });

  getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower();

  getActionDefinitionsBuilder(G_ROTR)
      .legalFor({{s32, s64}, {s64, s64}})
      .customIf([=](const LegalityQuery &Q) {
        return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
      })
      .lower();
  getActionDefinitionsBuilder(G_ROTL).lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .customFor({{s32, s32}, {s64, s64}});

  // TODO: Custom legalization for s128
  // TODO: Use generic lowering when custom lowering is not possible.
  auto always = [=](const LegalityQuery &Q) { return true; };
  getActionDefinitionsBuilder(G_CTPOP)
      .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
      .clampScalar(0, s32, s128)
      .widenScalarToNextPow2(0)
      .minScalarEltSameAsIf(always, 1, 0)
      .maxScalarEltSameAsIf(always, 1, 0)
      .customFor({{s32, s32},
                  {s64, s64},
                  {v2s64, v2s64},
                  {v2s32, v2s32},
                  {v4s32, v4s32},
                  {v4s16, v4s16},
                  {v8s16, v8s16}});

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                          MachineInstr &MI) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;
  switch (MI.getOpcode()) {
  default:
    // No idea what to do.
    return false;
  case TargetOpcode::G_VAARG:
    return legalizeVaArg(MI, MRI, MIRBuilder);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_STORE:
    return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR:
    return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_TRUNC:
    return legalizeVectorTrunc(MI, Helper);
  case TargetOpcode::G_SBFX:
  case TargetOpcode::G_UBFX:
    return legalizeBitfieldExtract(MI, MRI, Helper);
  case TargetOpcode::G_ROTR:
    return legalizeRotate(MI, MRI, Helper);
  case TargetOpcode::G_CTPOP:
    return legalizeCTPOP(MI, MRI, Helper);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpxchg128(MI, MRI, Helper);
  case TargetOpcode::G_CTTZ:
    return legalizeCTTZ(MI, Helper);
  }

  llvm_unreachable("expected switch to return");
}

bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          LegalizerHelper &Helper) const {
  // To allow for imported patterns to match, we ensure that the rotate amount
  // is 64b with an extension.
  Register AmtReg = MI.getOperand(2).getReg();
  LLT AmtTy = MRI.getType(AmtReg);
  (void)AmtTy;
  assert(AmtTy.isScalar() && "Expected a scalar rotate");
  assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
  auto NewAmt = Helper.MIRBuilder.buildSExt(LLT::scalar(64), AmtReg);
  Helper.Observer.changingInstr(MI);
  MI.getOperand(2).setReg(NewAmt.getReg(0));
  Helper.Observer.changedInstr(MI);
  return true;
}

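// Split \p Reg into \p NumParts new virtual registers of type \p Ty via a
// single G_UNMERGE_VALUES, appending the parts to \p VRegs.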
static void extractParts(Register Reg, MachineRegisterInfo &MRI,
                         MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts,
                         SmallVectorImpl<Register> &VRegs) {
  for (int I = 0; I < NumParts; ++I)
    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
  MIRBuilder.buildUnmerge(VRegs, Reg);
}

bool AArch64LegalizerInfo::legalizeVectorTrunc(
    MachineInstr &MI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // Similar to how operand splitting is done in SelectionDAG, we can handle
  // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
  //   %inlo(<4 x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
  //   %lo16(<4 x s16>) = G_TRUNC %inlo
  //   %hi16(<4 x s16>) = G_TRUNC %inhi
  //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
  //   %res(<8 x s8>) = G_TRUNC %in16

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  assert(isPowerOf2_32(DstTy.getSizeInBits()) &&
         isPowerOf2_32(SrcTy.getSizeInBits()));

  // Split input type.
  LLT SplitSrcTy =
      SrcTy.changeElementCount(SrcTy.getElementCount().divideCoefficientBy(2));
  // First, split the source into two smaller vectors.
  SmallVector<Register, 2> SplitSrcs;
  extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs);

  // Truncate the splits into intermediate narrower elements.
  LLT InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
  for (unsigned I = 0; I < SplitSrcs.size(); ++I)
    SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);

  auto Concat = MIRBuilder.buildConcatVectors(
      DstTy.changeElementSize(DstTy.getScalarSizeInBits() * 2), SplitSrcs);

  Helper.Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Concat.getReg(0));
  Helper.Observer.changedInstr(MI);
  return true;
}

bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
  // G_ADD_LOW instructions.
  // By splitting this here, we can optimize accesses in the small code model by
  // folding in the G_ADD_LOW into the load/store offset.
  auto &GlobalOp = MI.getOperand(1);
  const auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return true; // Don't want to modify TLS vars.

  auto &TM = ST->getTargetLowering()->getTargetMachine();
  unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);

  if (OpFlags & AArch64II::MO_GOT)
    return true;

  auto Offset = GlobalOp.getOffset();
  Register DstReg = MI.getOperand(0).getReg();
  auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
                  .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
  // Set the regclass on the dest reg too.
  MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);

  // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
  // by creating a MOVK that sets bits 48-63 of the register to (global address
  // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
  // prevent an incorrect tag being generated during relocation when the
  // global appears before the code section. Without the offset, a global at
  // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
  // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
  // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
  // instead of `0xf`.
  // This assumes that we're in the small code model so we can assume a binary
  // size of <= 4GB, which makes the untagged PC relative offset positive. The
  // binary must also be loaded into address range [0, 2^48). Both of these
  // properties need to be ensured at runtime when using tagged addresses.
  if (OpFlags & AArch64II::MO_TAGGED) {
    assert(!Offset &&
           "Should not have folded in an offset for a tagged global!");
    ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
               .addGlobalAddress(GV, 0x100000000,
                                 AArch64II::MO_PREL | AArch64II::MO_G3)
               .addImm(48);
    MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
  }

  MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
      .addGlobalAddress(GV, Offset,
                        OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                             MachineInstr &MI) const {
  return true;
}

bool AArch64LegalizerInfo::legalizeShlAshrLshr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
         MI.getOpcode() == TargetOpcode::G_LSHR ||
         MI.getOpcode() == TargetOpcode::G_SHL);
  // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
  // imported patterns can select it later. Either way, it will be legal.
  Register AmtReg = MI.getOperand(2).getReg();
  auto VRegAndVal = getConstantVRegValWithLookThrough(AmtReg, MRI);
  if (!VRegAndVal)
    return true;
  // Check the shift amount is in range for an immediate form.
  int64_t Amount = VRegAndVal->Value.getSExtValue();
  if (Amount > 31)
    return true; // This will have to remain a register variant.
  auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(ExtCst.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

// FIXME: This should be removed and replaced with the generic bitcast legalize
// action.
bool AArch64LegalizerInfo::legalizeLoadStore(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_STORE ||
         MI.getOpcode() == TargetOpcode::G_LOAD);
  // Here we just try to handle vector loads/stores where our value type might
  // have pointer elements, which the SelectionDAG importer can't handle. To
  // allow the existing patterns for s64 to fire for p0, we just try to bitcast
  // the value to use s64 types.

  // Custom legalization requires that the instruction, if not deleted, be
  // fully legalized. To allow further legalization of the instruction, we
  // create a new one and erase the existing one.

  Register ValReg = MI.getOperand(0).getReg();
  const LLT ValTy = MRI.getType(ValReg);

  if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
      ValTy.getElementType().getAddressSpace() != 0) {
    LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
    return false;
  }

  unsigned PtrSize = ValTy.getElementType().getSizeInBits();
  const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
  auto &MMO = **MI.memoperands_begin();
  MMO.setType(NewTy);

  if (MI.getOpcode() == TargetOpcode::G_STORE) {
    auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
    MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
  } else {
    auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
    MIRBuilder.buildBitcast(ValReg, NewLoad);
  }
  MI.eraseFromParent();
  return true;
}

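// Lower G_VAARG by hand: load the current va_list pointer, realign it if the
// requested alignment exceeds the pointer alignment, load the argument value,
// then store back the pointer advanced past the (pointer-aligned) slot.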
bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  Align Alignment(MI.getOperand(2).getImm());
  Register Dst = MI.getOperand(0).getReg();
  Register ListPtr = MI.getOperand(1).getReg();

  LLT PtrTy = MRI.getType(ListPtr);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
  const Align PtrAlign = Align(PtrSize);
  auto List = MIRBuilder.buildLoad(
      PtrTy, ListPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               PtrTy, PtrAlign));

  MachineInstrBuilder DstPtr;
  if (Alignment > PtrAlign) {
    // Realign the list to the actual required alignment.
    auto AlignMinus1 =
        MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
    auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
    DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
  } else
    DstPtr = List;

  LLT ValTy = MRI.getType(Dst);
  uint64_t ValSize = ValTy.getSizeInBits() / 8;
  MIRBuilder.buildLoad(
      Dst, DstPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               ValTy, std::max(Alignment, PtrAlign)));

  auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));

  auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));

  MIRBuilder.buildStore(NewList, ListPtr,
                        *MF.getMachineMemOperand(MachinePointerInfo(),
                                                 MachineMemOperand::MOStore,
                                                 PtrTy, PtrAlign));

  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeBitfieldExtract(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  // Only legal if we can select immediate forms.
  // TODO: Lower this otherwise.
  return getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
         getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
}

bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         LegalizerHelper &Helper) const {
  // While there is no integer popcount instruction, it can
  // be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions as long as the copies to/from
  // the AdvSIMD registers are cheap.
  //   FMOV  D0, X0        // copy 64-bit int to vector, high bits zero'd
  //   CNT   V0.8B, V0.8B  // 8xbyte pop-counts
  //   ADDV  B0, V0.8B     // sum 8xbyte pop-counts
  //   UMOV  X0, V0.B[0]   // copy byte result back to integer reg
  //
  // For 128 bit vector popcounts, we lower to the following sequence:
  //   cnt.16b   v0, v0  // v8s16, v4s32, v2s64
  //   uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
  //   uaddlp.4s v0, v0  //        v4s32, v2s64
  //   uaddlp.2d v0, v0  //               v2s64
  //
  // For 64 bit vector popcounts, we lower to the following sequence:
  //   cnt.8b    v0, v0  // v4s16, v2s32
  //   uaddlp.4h v0, v0  // v4s16, v2s32
  //   uaddlp.2s v0, v0  //        v2s32

  if (!ST->hasNEON() ||
      MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat))
    return false;
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  Register Dst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Val);

  assert(Ty == MRI.getType(Dst) &&
         "Expected src and dst to have the same type!");
  unsigned Size = Ty.getSizeInBits();

  // Pre-conditioning: widen Val up to the nearest vector type.
  // s32,s64,v4s16,v2s32 -> v8i8
  // v8s16,v4s32,v2s64 -> v16i8
  LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
  if (Ty.isScalar()) {
    // TODO: Handle s128.
    assert((Size == 32 || Size == 64) && "Expected only 32 or 64 bit scalars!");
    if (Size == 32) {
      Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
    }
  }
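  // View the (possibly widened) value as a vector of bytes, since CNT counts
  // bits per byte lane.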
  Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);

  // Count bits in each byte-sized lane.
  auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);

  // Sum across lanes.
  Register HSum = CTPOP.getReg(0);
  unsigned Opc;
  SmallVector<LLT> HAddTys;
  if (Ty.isScalar()) {
    Opc = Intrinsic::aarch64_neon_uaddlv;
    HAddTys.push_back(LLT::scalar(32));
  } else if (Ty == LLT::fixed_vector(8, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
  } else if (Ty == LLT::fixed_vector(4, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
  } else if (Ty == LLT::fixed_vector(2, 64)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
    HAddTys.push_back(LLT::fixed_vector(2, 64));
  } else if (Ty == LLT::fixed_vector(4, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
  } else if (Ty == LLT::fixed_vector(2, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
    HAddTys.push_back(LLT::fixed_vector(2, 32));
  } else
    llvm_unreachable("unexpected vector shape");
  MachineInstrBuilder UADD;
  for (LLT HTy : HAddTys) {
    UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}, /*HasSideEffects =*/false)
               .addUse(HSum);
    HSum = UADD.getReg(0);
  }

  // Post-conditioning.
  if (Ty.isScalar() && Size == 64)
    MIRBuilder.buildZExt(Dst, UADD);
  else
    UADD->getOperand(0).setReg(Dst);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  LLT s64 = LLT::scalar(64);
  auto Addr = MI.getOperand(1).getReg();
  auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
  auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
  auto DstLo = MRI.createGenericVirtualRegister(s64);
  auto DstHi = MRI.createGenericVirtualRegister(s64);

  MachineInstrBuilder CAS;
  if (ST->hasLSE()) {
    // We have 128-bit CASP instructions taking XSeqPair registers, which are
    // s128. We need the merge/unmerge to bracket the expansion and pair up with
    // the rest of the MIR so we must reassemble the extracted registers into a
    // 128-bit known-regclass one with code like this:
    //
    //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
    //     %out = CASP %in1, ...
    //     %OldLo = G_EXTRACT %out, 0
    //     %OldHi = G_EXTRACT %out, 64
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CASPAX;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CASPLX;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CASPALX;
      break;
    default:
      Opcode = AArch64::CASPX;
      break;
    }

    LLT s128 = LLT::scalar(128);
    auto CASDst = MRI.createGenericVirtualRegister(s128);
    auto CASDesired = MRI.createGenericVirtualRegister(s128);
    auto CASNew = MRI.createGenericVirtualRegister(s128);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
        .addUse(DesiredI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(DesiredI->getOperand(1).getReg())
        .addImm(AArch64::subo64);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
        .addUse(NewI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(NewI->getOperand(1).getReg())
        .addImm(AArch64::subo64);

    CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});

    MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
    MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
  } else {
    // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
    // can take arbitrary registers, so it just has the normal GPR64 operands
    // that the rest of AArch64 is expecting.
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CMP_SWAP_128_RELEASE;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CMP_SWAP_128;
      break;
    default:
      Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
      break;
    }

    auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
                                {Addr, DesiredI->getOperand(0),
                                 DesiredI->getOperand(1), NewI->getOperand(0),
                                 NewI->getOperand(1)});
  }

  CAS.cloneMemRefs(MI);
  constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
                                   *MRI.getTargetRegisterInfo(),
                                   *ST->getRegBankInfo());

  MIRBuilder.buildMerge(MI.getOperand(0), {DstLo, DstHi});
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
                                        LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
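  // AArch64 has RBIT and CLZ but no direct count-trailing-zeros instruction,
  // so use cttz(x) == ctlz(bitreverse(x)).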
  LLT Ty = MRI.getType(MI.getOperand(1).getReg());
  auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
  MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
  MI.eraseFromParent();
  return true;
}