1 //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64LegalizerInfo.h"
15 #include "AArch64RegisterBankInfo.h"
16 #include "AArch64Subtarget.h"
17 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
19 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
20 #include "llvm/CodeGen/GlobalISel/Utils.h"
21 #include "llvm/CodeGen/MachineInstr.h"
22 #include "llvm/CodeGen/MachineRegisterInfo.h"
23 #include "llvm/CodeGen/TargetOpcodes.h"
24 #include "llvm/CodeGen/ValueTypes.h"
25 #include "llvm/IR/DerivedTypes.h"
26 #include "llvm/IR/IntrinsicsAArch64.h"
27 #include "llvm/IR/Type.h"
28 #include "llvm/Support/MathExtras.h"
29 #include <initializer_list>
30 
31 #define DEBUG_TYPE "aarch64-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 
38 AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
39     : ST(&ST) {
40   using namespace TargetOpcode;
41   const LLT p0 = LLT::pointer(0, 64);
42   const LLT s1 = LLT::scalar(1);
43   const LLT s8 = LLT::scalar(8);
44   const LLT s16 = LLT::scalar(16);
45   const LLT s32 = LLT::scalar(32);
46   const LLT s64 = LLT::scalar(64);
47   const LLT s128 = LLT::scalar(128);
48   const LLT s256 = LLT::scalar(256);
49   const LLT v16s8 = LLT::fixed_vector(16, 8);
50   const LLT v8s8 = LLT::fixed_vector(8, 8);
51   const LLT v4s8 = LLT::fixed_vector(4, 8);
52   const LLT v8s16 = LLT::fixed_vector(8, 16);
53   const LLT v4s16 = LLT::fixed_vector(4, 16);
54   const LLT v2s16 = LLT::fixed_vector(2, 16);
55   const LLT v2s32 = LLT::fixed_vector(2, 32);
56   const LLT v4s32 = LLT::fixed_vector(4, 32);
57   const LLT v2s64 = LLT::fixed_vector(2, 64);
58   const LLT v2p0 = LLT::fixed_vector(2, p0);
59 
60   std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
61                                                         v16s8, v8s16, v4s32,
62                                                         v2s64, v2p0,
63                                                         /* End 128bit types */
64                                                         /* Begin 64bit types */
65                                                         v8s8, v4s16, v2s32};
66 
67   const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
68 
69   // FIXME: support subtargets which have neon/fp-armv8 disabled.
70   if (!ST.hasNEON() || !ST.hasFPARMv8()) {
71     getLegacyLegalizerInfo().computeTables();
72     return;
73   }
74 
75   // Some instructions only support s16 if the subtarget has full 16-bit FP
76   // support.
77   const bool HasFP16 = ST.hasFullFP16();
78   const LLT &MinFPScalar = HasFP16 ? s16 : s32;
79 
80   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
81       .legalFor({p0, s1, s8, s16, s32, s64})
82       .legalFor(PackedVectorAllTypeList)
83       .clampScalar(0, s8, s64)
84       .widenScalarToNextPow2(0, 8)
85       .fewerElementsIf(
86           [=](const LegalityQuery &Query) {
87             return Query.Types[0].isVector() &&
88                    (Query.Types[0].getElementType() != s64 ||
89                     Query.Types[0].getNumElements() != 2);
90           },
91           [=](const LegalityQuery &Query) {
92             LLT EltTy = Query.Types[0].getElementType();
93             if (EltTy == s64)
94               return std::make_pair(0, LLT::fixed_vector(2, 64));
95             return std::make_pair(0, EltTy);
96           });
97 
98   getActionDefinitionsBuilder(G_PHI).legalFor({p0, s16, s32, s64})
99       .legalFor(PackedVectorAllTypeList)
100       .clampScalar(0, s16, s64)
101       .widenScalarToNextPow2(0);
102 
103   getActionDefinitionsBuilder(G_BSWAP)
104       .legalFor({s32, s64, v4s32, v2s32, v2s64})
105       .clampScalar(0, s32, s64)
106       .widenScalarToNextPow2(0);
107 
108   getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
109       .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
110       .scalarizeIf(
111           [=](const LegalityQuery &Query) {
112             return Query.Opcode == G_MUL && Query.Types[0] == v2s64;
113           },
114           0)
115       .legalFor({v2s64})
116       .clampScalar(0, s32, s64)
117       .widenScalarToNextPow2(0)
118       .clampNumElements(0, v2s32, v4s32)
119       .clampNumElements(0, v2s64, v2s64)
120       .moreElementsToNextPow2(0);
121 
122   getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
123       .customIf([=](const LegalityQuery &Query) {
124         const auto &SrcTy = Query.Types[0];
125         const auto &AmtTy = Query.Types[1];
126         return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
127                AmtTy.getSizeInBits() == 32;
128       })
129       .legalFor({
130           {s32, s32},
131           {s32, s64},
132           {s64, s64},
133           {v8s8, v8s8},
134           {v16s8, v16s8},
135           {v4s16, v4s16},
136           {v8s16, v8s16},
137           {v2s32, v2s32},
138           {v4s32, v4s32},
139           {v2s64, v2s64},
140       })
141       .widenScalarToNextPow2(0)
142       .clampScalar(1, s32, s64)
143       .clampScalar(0, s32, s64)
144       .clampNumElements(0, v2s32, v4s32)
145       .clampNumElements(0, v2s64, v2s64)
146       .moreElementsToNextPow2(0)
147       .minScalarSameAs(1, 0);
148 
149   getActionDefinitionsBuilder(G_PTR_ADD)
150       .legalFor({{p0, s64}, {v2p0, v2s64}})
151       .clampScalar(1, s64, s64);
152 
153   getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
154 
155   getActionDefinitionsBuilder({G_SDIV, G_UDIV})
156       .legalFor({s32, s64})
157       .libcallFor({s128})
158       .clampScalar(0, s32, s64)
159       .widenScalarToNextPow2(0)
160       .scalarize(0);
161 
162   getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
163       .lowerFor({s1, s8, s16, s32, s64});
164 
165   getActionDefinitionsBuilder({G_SMULO, G_UMULO}).lowerFor({{s64, s1}});
166 
167   getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64});
168 
169   getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
170       .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
171       .clampNumElements(0, v8s8, v16s8)
172       .clampNumElements(0, v4s16, v8s16)
173       .clampNumElements(0, v2s32, v4s32)
      // FIXME: This shouldn't be needed as v2s64 types are going to
      // be expanded anyway, but G_ICMP doesn't support splitting vectors yet.
176       .clampNumElements(0, v2s64, v2s64)
177       .lower();
178 
179   getActionDefinitionsBuilder(
180       {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
181       .legalFor({{s32, s1}, {s64, s1}})
182       .clampScalar(0, s32, s64)
183       .widenScalarToNextPow2(0);
184 
185   getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
186       .legalFor({s32, s64, v2s64, v4s32, v2s32})
187       .clampNumElements(0, v2s32, v4s32)
188       .clampNumElements(0, v2s64, v2s64);
189 
190   getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64});
191 
192   getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT,
193                                G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND,
194                                G_FNEARBYINT, G_INTRINSIC_LRINT})
195       // If we don't have full FP16 support, then scalarize the elements of
196       // vectors containing fp16 types.
197       .fewerElementsIf(
198           [=, &ST](const LegalityQuery &Query) {
199             const auto &Ty = Query.Types[0];
200             return Ty.isVector() && Ty.getElementType() == s16 &&
201                    !ST.hasFullFP16();
202           },
203           [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
204       // If we don't have full FP16 support, then widen s16 to s32 if we
205       // encounter it.
206       .widenScalarIf(
207           [=, &ST](const LegalityQuery &Query) {
208             return Query.Types[0] == s16 && !ST.hasFullFP16();
209           },
210           [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
211       .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});
212 
213   getActionDefinitionsBuilder(
214       {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW})
215       // We need a call for these, so we always need to scalarize.
216       .scalarize(0)
217       // Regardless of FP16 support, widen 16-bit elements to 32-bits.
218       .minScalar(0, s32)
219       .libcallFor({s32, s64, v2s32, v4s32, v2s64});
220 
221   getActionDefinitionsBuilder(G_INSERT)
222       .unsupportedIf([=](const LegalityQuery &Query) {
223         return Query.Types[0].getSizeInBits() <= Query.Types[1].getSizeInBits();
224       })
225       .legalIf([=](const LegalityQuery &Query) {
226         const LLT &Ty0 = Query.Types[0];
227         const LLT &Ty1 = Query.Types[1];
228         if (Ty0 != s32 && Ty0 != s64 && Ty0 != p0)
229           return false;
230         return isPowerOf2_32(Ty1.getSizeInBits()) &&
231                (Ty1.getSizeInBits() == 1 || Ty1.getSizeInBits() >= 8);
232       })
233       .clampScalar(0, s32, s64)
234       .widenScalarToNextPow2(0)
235       .maxScalarIf(typeInSet(0, {s32}), 1, s16)
236       .maxScalarIf(typeInSet(0, {s64}), 1, s32)
237       .widenScalarToNextPow2(1);
238 
239   getActionDefinitionsBuilder(G_EXTRACT)
240       .unsupportedIf([=](const LegalityQuery &Query) {
241         return Query.Types[0].getSizeInBits() >= Query.Types[1].getSizeInBits();
242       })
243       .legalIf([=](const LegalityQuery &Query) {
244         const LLT &Ty0 = Query.Types[0];
245         const LLT &Ty1 = Query.Types[1];
246         if (Ty1 != s32 && Ty1 != s64 && Ty1 != s128)
247           return false;
248         if (Ty1 == p0)
249           return true;
250         return isPowerOf2_32(Ty0.getSizeInBits()) &&
251                (Ty0.getSizeInBits() == 1 || Ty0.getSizeInBits() >= 8);
252       })
253       .clampScalar(1, s32, s128)
254       .widenScalarToNextPow2(1)
255       .maxScalarIf(typeInSet(1, {s32}), 0, s16)
256       .maxScalarIf(typeInSet(1, {s64}), 0, s32)
257       .widenScalarToNextPow2(0);
258 
259   getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
260       .lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
261       .legalForTypesWithMemDesc({{s32, p0, s8, 8},
262                                  {s32, p0, s16, 8},
263                                  {s32, p0, s32, 8},
264                                  {s64, p0, s8, 2},
265                                  {s64, p0, s16, 2},
266                                  {s64, p0, s32, 4},
267                                  {s64, p0, s64, 8},
268                                  {p0, p0, s64, 8},
269                                  {v2s32, p0, s64, 8}})
270       .clampScalar(0, s32, s64)
271       .widenScalarToNextPow2(0)
272       // TODO: We could support sum-of-pow2's but the lowering code doesn't know
273       //       how to do that yet.
274       .unsupportedIfMemSizeNotPow2()
275       // Lower anything left over into G_*EXT and G_LOAD
276       .lower();
277 
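  // Predicate matching vector value types whose elements are pointers in
  // address space 0; loads/stores of these get custom handling in
  // legalizeLoadStore, which bitcasts them to vectors of s64.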
278   auto IsPtrVecPred = [=](const LegalityQuery &Query) {
279     const LLT &ValTy = Query.Types[0];
280     if (!ValTy.isVector())
281       return false;
282     const LLT EltTy = ValTy.getElementType();
283     return EltTy.isPointer() && EltTy.getAddressSpace() == 0;
284   };
285 
286   getActionDefinitionsBuilder(G_LOAD)
287       .legalForTypesWithMemDesc({{s8, p0, s8, 8},
288                                  {s16, p0, s16, 8},
289                                  {s32, p0, s32, 8},
290                                  {s64, p0, s64, 8},
291                                  {p0, p0, s64, 8},
292                                  {s128, p0, s128, 8},
293                                  {v8s8, p0, s64, 8},
294                                  {v16s8, p0, s128, 8},
295                                  {v4s16, p0, s64, 8},
296                                  {v8s16, p0, s128, 8},
297                                  {v2s32, p0, s64, 8},
298                                  {v4s32, p0, s128, 8},
299                                  {v2s64, p0, s128, 8}})
300       // These extends are also legal
301       .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}})
302       .clampScalar(0, s8, s64)
303       .lowerIfMemSizeNotPow2()
304       .widenScalarToNextPow2(0)
305       .narrowScalarIf([=](const LegalityQuery &Query) {
306         // Clamp extending load results to 32-bits.
307         return Query.Types[0].isScalar() &&
308           Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
309           Query.Types[0].getSizeInBits() > 32;
310         },
311         changeTo(0, s32))
312       // Lower any any-extending loads left into G_ANYEXT and G_LOAD
313       .lowerIf([=](const LegalityQuery &Query) {
314         return Query.Types[0] != Query.MMODescrs[0].MemoryTy;
315       })
316       .clampMaxNumElements(0, s8, 16)
317       .clampMaxNumElements(0, s16, 8)
318       .clampMaxNumElements(0, s32, 4)
319       .clampMaxNumElements(0, s64, 2)
320       .customIf(IsPtrVecPred)
321       .scalarizeIf(typeIs(0, v2s16), 0);
322 
323   getActionDefinitionsBuilder(G_STORE)
324       .legalForTypesWithMemDesc({{s8, p0, s8, 8},
325                                  {s16, p0, s8, 8}, // truncstorei8 from s16
326                                  {s32, p0, s8, 8}, // truncstorei8 from s32
327                                  {s64, p0, s8, 8}, // truncstorei8 from s64
328                                  {s16, p0, s16, 8},
329                                  {s32, p0, s16, 8}, // truncstorei16 from s32
330                                  {s64, p0, s16, 8}, // truncstorei16 from s64
331                                  {s32, p0, s8, 8},
332                                  {s32, p0, s16, 8},
333                                  {s32, p0, s32, 8},
334                                  {s64, p0, s64, 8},
335                                  {s64, p0, s32, 8}, // truncstorei32 from s64
336                                  {p0, p0, s64, 8},
337                                  {s128, p0, s128, 8},
338                                  {v16s8, p0, s128, 8},
339                                  {v8s8, p0, s64, 8},
340                                  {v4s16, p0, s64, 8},
341                                  {v8s16, p0, s128, 8},
342                                  {v2s32, p0, s64, 8},
343                                  {v4s32, p0, s128, 8},
344                                  {v2s64, p0, s128, 8}})
345       .clampScalar(0, s8, s64)
346       .lowerIf([=](const LegalityQuery &Query) {
347         return Query.Types[0].isScalar() &&
348                Query.Types[0] != Query.MMODescrs[0].MemoryTy;
349       })
350       // Maximum: sN * k = 128
351       .clampMaxNumElements(0, s8, 16)
352       .clampMaxNumElements(0, s16, 8)
353       .clampMaxNumElements(0, s32, 4)
354       .clampMaxNumElements(0, s64, 2)
355       .lowerIfMemSizeNotPow2()
356       .customIf(IsPtrVecPred)
357       .scalarizeIf(typeIs(0, v2s16), 0);
358 
359   // Constants
360   getActionDefinitionsBuilder(G_CONSTANT)
361       .legalFor({p0, s8, s16, s32, s64})
362       .clampScalar(0, s8, s64)
363       .widenScalarToNextPow2(0);
364   getActionDefinitionsBuilder(G_FCONSTANT)
365       .legalIf([=](const LegalityQuery &Query) {
366         const auto &Ty = Query.Types[0];
367         if (HasFP16 && Ty == s16)
368           return true;
369         return Ty == s32 || Ty == s64 || Ty == s128;
370       })
371       .clampScalar(0, MinFPScalar, s128);
372 
373   getActionDefinitionsBuilder({G_ICMP, G_FCMP})
374       .legalFor({{s32, s32},
375                  {s32, s64},
376                  {s32, p0},
377                  {v4s32, v4s32},
378                  {v2s32, v2s32},
379                  {v2s64, v2s64},
380                  {v2s64, v2p0},
381                  {v4s16, v4s16},
382                  {v8s16, v8s16},
383                  {v8s8, v8s8},
384                  {v16s8, v16s8}})
385       .clampScalar(1, s32, s64)
386       .clampScalar(0, s32, s32)
387       .minScalarEltSameAsIf(
388           [=](const LegalityQuery &Query) {
389             const LLT &Ty = Query.Types[0];
390             const LLT &SrcTy = Query.Types[1];
391             return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
392                    Ty.getElementType() != SrcTy.getElementType();
393           },
394           0, 1)
395       .minScalarOrEltIf(
396           [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
397           1, s32)
398       .minScalarOrEltIf(
399           [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
400           s64)
401       .widenScalarOrEltToNextPow2(1)
402       .clampNumElements(0, v2s32, v4s32);
403 
404   // Extensions
405   auto ExtLegalFunc = [=](const LegalityQuery &Query) {
406     unsigned DstSize = Query.Types[0].getSizeInBits();
407 
408     if (DstSize == 128 && !Query.Types[0].isVector())
409       return false; // Extending to a scalar s128 needs narrowing.
410 
411     // Make sure that we have something that will fit in a register, and
412     // make sure it's a power of 2.
413     if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize))
414       return false;
415 
416     const LLT &SrcTy = Query.Types[1];
417 
418     // Special case for s1.
419     if (SrcTy == s1)
420       return true;
421 
422     // Make sure we fit in a register otherwise. Don't bother checking that
423     // the source type is below 128 bits. We shouldn't be allowing anything
424     // through which is wider than the destination in the first place.
425     unsigned SrcSize = SrcTy.getSizeInBits();
426     if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
427       return false;
428 
429     return true;
430   };
431   getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
432       .legalIf(ExtLegalFunc)
433       .clampScalar(0, s64, s64); // Just for s128, others are handled above.
434 
435   getActionDefinitionsBuilder(G_TRUNC)
436       .minScalarOrEltIf(
437           [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
438           0, s8)
439       .customIf([=](const LegalityQuery &Query) {
440         LLT DstTy = Query.Types[0];
441         LLT SrcTy = Query.Types[1];
442         return DstTy == v8s8 && SrcTy.getSizeInBits() > 128;
443       })
444       .alwaysLegal();
445 
446   getActionDefinitionsBuilder(G_SEXT_INREG).legalFor({s32, s64}).lower();
447 
448   // FP conversions
449   getActionDefinitionsBuilder(G_FPTRUNC)
450       .legalFor(
451           {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
452       .clampMaxNumElements(0, s32, 2);
453   getActionDefinitionsBuilder(G_FPEXT)
454       .legalFor(
455           {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
456       .clampMaxNumElements(0, s64, 2);
457 
458   // Conversions
459   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
460       .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
461       .clampScalar(0, s32, s64)
462       .widenScalarToNextPow2(0)
463       .clampScalar(1, s32, s64)
464       .widenScalarToNextPow2(1);
465 
466   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
467       .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
468       .clampScalar(1, s32, s64)
469       .minScalarSameAs(1, 0)
470       .clampScalar(0, s32, s64)
471       .widenScalarToNextPow2(0);
472 
473   // Control-flow
474   getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s8, s16, s32});
475   getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
476 
477   getActionDefinitionsBuilder(G_SELECT)
478       .legalFor({{s32, s1}, {s64, s1}, {p0, s1}})
479       .clampScalar(0, s32, s64)
480       .widenScalarToNextPow2(0)
481       .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
482       .lowerIf(isVector(0));
483 
484   // Pointer-handling
485   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
486 
487   if (TM.getCodeModel() == CodeModel::Small)
488     getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
489   else
490     getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
491 
492   getActionDefinitionsBuilder(G_PTRTOINT)
493       .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
494       .maxScalar(0, s64)
495       .widenScalarToNextPow2(0, /*Min*/ 8);
496 
497   getActionDefinitionsBuilder(G_INTTOPTR)
498       .unsupportedIf([&](const LegalityQuery &Query) {
499         return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
500       })
501       .legalFor({{p0, s64}, {v2p0, v2s64}});
502 
  // Casts for 32 and 64-bit widths are just copies.
  // Same for 128-bit widths, except they are on the FPR bank.
505   getActionDefinitionsBuilder(G_BITCAST)
506       // FIXME: This is wrong since G_BITCAST is not allowed to change the
507       // number of bits but it's what the previous code described and fixing
508       // it breaks tests.
509       .legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
510                                  v8s16, v4s16, v2s16, v4s32, v2s32, v2s64,
511                                  v2p0});
512 
513   getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
514 
515   // va_list must be a pointer, but most sized types are pretty easy to handle
516   // as the destination.
517   getActionDefinitionsBuilder(G_VAARG)
518       .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
519       .clampScalar(0, s8, s64)
520       .widenScalarToNextPow2(0, /*Min*/ 8);
521 
522   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
      .lowerIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, s1),
                   typeIs(2, p0)));
525 
526   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
527       .customIf([](const LegalityQuery &Query) {
528         return Query.Types[0].getSizeInBits() == 128;
529       })
530       .clampScalar(0, s32, s64)
531       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
532 
533   getActionDefinitionsBuilder(
534       {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
535        G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
536        G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
537       .clampScalar(0, s32, s64)
538       .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));
539 
540   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});
541 
542   // Merge/Unmerge
543   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
544     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
545     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
546 
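    // Elements are only kept as-is if they are power-of-2 scalars between 8
    // and 64 bits wide; anything else gets scalarized by the rules below.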
547     auto notValidElt = [](const LegalityQuery &Query, unsigned TypeIdx) {
548       const LLT &Ty = Query.Types[TypeIdx];
549       if (Ty.isVector()) {
550         const LLT &EltTy = Ty.getElementType();
551         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
552           return true;
553         if (!isPowerOf2_32(EltTy.getSizeInBits()))
554           return true;
555       }
556       return false;
557     };
558 
559     // FIXME: This rule is horrible, but specifies the same as what we had
560     // before with the particularly strange definitions removed (e.g.
561     // s8 = G_MERGE_VALUES s32, s32).
562     // Part of the complexity comes from these ops being extremely flexible. For
563     // example, you can build/decompose vectors with it, concatenate vectors,
564     // etc. and in addition to this you can also bitcast with it at the same
565     // time. We've been considering breaking it up into multiple ops to make it
566     // more manageable throughout the backend.
567     getActionDefinitionsBuilder(Op)
568         // Break up vectors with weird elements into scalars
569         .fewerElementsIf(
570             [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
571             scalarize(0))
572         .fewerElementsIf(
573             [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
574             scalarize(1))
575         // Clamp the big scalar to s8-s128 and make it a power of 2.
576         .clampScalar(BigTyIdx, s8, s128)
577         .widenScalarIf(
578             [=](const LegalityQuery &Query) {
579               const LLT &Ty = Query.Types[BigTyIdx];
580               return !isPowerOf2_32(Ty.getSizeInBits()) &&
581                      Ty.getSizeInBits() % 64 != 0;
582             },
583             [=](const LegalityQuery &Query) {
              // Pick the next power of 2, or a multiple of 64 over 128,
              // whichever is smaller.
586               const LLT &Ty = Query.Types[BigTyIdx];
587               unsigned NewSizeInBits = 1
588                                        << Log2_32_Ceil(Ty.getSizeInBits() + 1);
589               if (NewSizeInBits >= 256) {
590                 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
591                 if (RoundedTo < NewSizeInBits)
592                   NewSizeInBits = RoundedTo;
593               }
594               return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
595             })
596         // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
597         // worth considering the multiples of 64 since 2*192 and 2*384 are not
598         // valid.
599         .clampScalar(LitTyIdx, s8, s256)
600         .widenScalarToNextPow2(LitTyIdx, /*Min*/ 8)
601         // So at this point, we have s8, s16, s32, s64, s128, s192, s256, s384,
602         // s512, <X x s8>, <X x s16>, <X x s32>, or <X x s64>.
603         // At this point it's simple enough to accept the legal types.
604         .legalIf([=](const LegalityQuery &Query) {
605           const LLT &BigTy = Query.Types[BigTyIdx];
606           const LLT &LitTy = Query.Types[LitTyIdx];
607           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
608             return false;
609           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
610             return false;
611           return BigTy.getSizeInBits() % LitTy.getSizeInBits() == 0;
612         })
613         // Any vectors left are the wrong size. Scalarize them.
614         .scalarize(0)
615         .scalarize(1);
616   }
617 
618   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
619       .unsupportedIf([=](const LegalityQuery &Query) {
620         const LLT &EltTy = Query.Types[1].getElementType();
621         return Query.Types[0] != EltTy;
622       })
623       .minScalar(2, s64)
624       .legalIf([=](const LegalityQuery &Query) {
625         const LLT &VecTy = Query.Types[1];
        return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
               VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
               VecTy == v16s8 || VecTy == v2p0;
629       })
630       .minScalarOrEltIf(
631           [=](const LegalityQuery &Query) {
            // We want to promote <M x s1> to <M x s64> if that wouldn't
            // cause the total vector size to exceed 128b.
634             return Query.Types[1].getNumElements() <= 2;
635           },
636           0, s64)
637       .minScalarOrEltIf(
638           [=](const LegalityQuery &Query) {
639             return Query.Types[1].getNumElements() <= 4;
640           },
641           0, s32)
642       .minScalarOrEltIf(
643           [=](const LegalityQuery &Query) {
644             return Query.Types[1].getNumElements() <= 8;
645           },
646           0, s16)
647       .minScalarOrEltIf(
648           [=](const LegalityQuery &Query) {
649             return Query.Types[1].getNumElements() <= 16;
650           },
651           0, s8)
652       .minScalarOrElt(0, s8) // Worst case, we need at least s8.
653       .clampMaxNumElements(1, s64, 2)
654       .clampMaxNumElements(1, s32, 4)
655       .clampMaxNumElements(1, s16, 8)
656       .clampMaxNumElements(1, p0, 2);
657 
658   getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
659       .legalIf(typeInSet(0, {v8s16, v2s32, v4s32, v2s64}));
660 
661   getActionDefinitionsBuilder(G_BUILD_VECTOR)
662       .legalFor({{v8s8, s8},
663                  {v16s8, s8},
664                  {v2s16, s16},
665                  {v4s16, s16},
666                  {v8s16, s16},
667                  {v2s32, s32},
668                  {v4s32, s32},
669                  {v2p0, p0},
670                  {v2s64, s64}})
671       .clampNumElements(0, v4s32, v4s32)
672       .clampNumElements(0, v2s64, v2s64)
673       .minScalarSameAs(1, 0);
674 
675   getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
676 
677   getActionDefinitionsBuilder(G_CTLZ)
678       .legalForCartesianProduct(
679           {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
680       .scalarize(1);
681   getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
682 
683   // TODO: Custom lowering for v2s32, v4s32, v2s64.
684   getActionDefinitionsBuilder(G_BITREVERSE).legalFor({s32, s64, v8s8, v16s8});
685 
686   getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
687 
688   // TODO: Handle vector types.
689   getActionDefinitionsBuilder(G_CTTZ)
690       .clampScalar(0, s32, s64)
691       .scalarSameSizeAs(1, 0)
692       .customFor({s32, s64});
693 
694   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
695       .legalIf([=](const LegalityQuery &Query) {
696         const LLT &DstTy = Query.Types[0];
697         const LLT &SrcTy = Query.Types[1];
698         // For now just support the TBL2 variant which needs the source vectors
699         // to be the same size as the dest.
700         if (DstTy != SrcTy)
701           return false;
702         for (auto &Ty : {v2s32, v4s32, v2s64, v2p0, v16s8, v8s16}) {
703           if (DstTy == Ty)
704             return true;
705         }
706         return false;
707       })
      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors); we
      // just want those lowered into G_BUILD_VECTOR.
710       .lowerIf([=](const LegalityQuery &Query) {
711         return !Query.Types[1].isVector();
712       })
713       .moreElementsToNextPow2(0)
714       .clampNumElements(0, v4s32, v4s32)
715       .clampNumElements(0, v2s64, v2s64);
716 
717   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
718       .legalFor({{v4s32, v2s32}, {v8s16, v4s16}});
719 
720   getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}});
721 
722   getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) {
723     return Query.Types[0] == p0 && Query.Types[1] == s64;
724   });
725 
726   getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();
727 
728   getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
729       .libcall();
730 
731   // FIXME: Legal types are only legal with NEON.
732   getActionDefinitionsBuilder(G_ABS)
733       .lowerIf(isScalar(0))
734       .legalFor(PackedVectorAllTypeList);
735 
736   getActionDefinitionsBuilder(G_VECREDUCE_FADD)
737       // We only have FADDP to do reduction-like operations. Lower the rest.
738       .legalFor({{s32, v2s32}, {s64, v2s64}})
739       .clampMaxNumElements(1, s64, 2)
740       .clampMaxNumElements(1, s32, 2)
741       .lower();
742 
743   getActionDefinitionsBuilder(G_VECREDUCE_ADD)
744       .legalFor(
745           {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
746       .clampMaxNumElements(1, s64, 2)
747       .clampMaxNumElements(1, s32, 4)
748       .lower();
749 
750   getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
751       .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });
752 
753   getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower();
754 
755   getActionDefinitionsBuilder(G_ROTR)
756       .legalFor({{s32, s64}, {s64, s64}})
757       .customIf([=](const LegalityQuery &Q) {
758         return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
759       })
760       .lower();
761   getActionDefinitionsBuilder(G_ROTL).lower();
762 
763   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
764       .customFor({{s32, s32}, {s64, s64}});
765 
766   // TODO: Custom legalization for s128
767   // TODO: Use generic lowering when custom lowering is not possible.
768   auto always = [=](const LegalityQuery &Q) { return true; };
769   getActionDefinitionsBuilder(G_CTPOP)
770       .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
771       .clampScalar(0, s32, s128)
772       .widenScalarToNextPow2(0)
773       .minScalarEltSameAsIf(always, 1, 0)
774       .maxScalarEltSameAsIf(always, 1, 0)
775       .customFor({{s32, s32},
776                   {s64, s64},
777                   {v2s64, v2s64},
778                   {v2s32, v2s32},
779                   {v4s32, v4s32},
780                   {v4s16, v4s16},
781                   {v8s16, v8s16}});
782 
783   getLegacyLegalizerInfo().computeTables();
784   verify(*ST.getInstrInfo());
785 }
786 
787 bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
788                                           MachineInstr &MI) const {
789   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
790   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
791   GISelChangeObserver &Observer = Helper.Observer;
792   switch (MI.getOpcode()) {
793   default:
794     // No idea what to do.
795     return false;
796   case TargetOpcode::G_VAARG:
797     return legalizeVaArg(MI, MRI, MIRBuilder);
798   case TargetOpcode::G_LOAD:
799   case TargetOpcode::G_STORE:
800     return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
801   case TargetOpcode::G_SHL:
802   case TargetOpcode::G_ASHR:
803   case TargetOpcode::G_LSHR:
804     return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
805   case TargetOpcode::G_GLOBAL_VALUE:
806     return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
807   case TargetOpcode::G_TRUNC:
808     return legalizeVectorTrunc(MI, Helper);
809   case TargetOpcode::G_SBFX:
810   case TargetOpcode::G_UBFX:
811     return legalizeBitfieldExtract(MI, MRI, Helper);
812   case TargetOpcode::G_ROTR:
813     return legalizeRotate(MI, MRI, Helper);
814   case TargetOpcode::G_CTPOP:
815     return legalizeCTPOP(MI, MRI, Helper);
816   case TargetOpcode::G_ATOMIC_CMPXCHG:
817     return legalizeAtomicCmpxchg128(MI, MRI, Helper);
818   case TargetOpcode::G_CTTZ:
819     return legalizeCTTZ(MI, Helper);
820   }
821 
822   llvm_unreachable("expected switch to return");
823 }
824 
825 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
826                                           MachineRegisterInfo &MRI,
827                                           LegalizerHelper &Helper) const {
828   // To allow for imported patterns to match, we ensure that the rotate amount
829   // is 64b with an extension.
830   Register AmtReg = MI.getOperand(2).getReg();
831   LLT AmtTy = MRI.getType(AmtReg);
832   (void)AmtTy;
833   assert(AmtTy.isScalar() && "Expected a scalar rotate");
834   assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
835   auto NewAmt = Helper.MIRBuilder.buildSExt(LLT::scalar(64), AmtReg);
836   Helper.Observer.changingInstr(MI);
837   MI.getOperand(2).setReg(NewAmt.getReg(0));
838   Helper.Observer.changedInstr(MI);
839   return true;
840 }
841 
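// Split \p Reg into \p NumParts pieces of type \p Ty by creating fresh
// virtual registers and emitting a single G_UNMERGE_VALUES; the pieces are
// appended to \p VRegs.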
842 static void extractParts(Register Reg, MachineRegisterInfo &MRI,
843                          MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts,
844                          SmallVectorImpl<Register> &VRegs) {
845   for (int I = 0; I < NumParts; ++I)
846     VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
847   MIRBuilder.buildUnmerge(VRegs, Reg);
848 }
849 
850 bool AArch64LegalizerInfo::legalizeVectorTrunc(
851     MachineInstr &MI, LegalizerHelper &Helper) const {
852   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
853   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // Similar to how operand splitting is done in SelectionDAG, we can handle
855   // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
856   //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
857   //   %lo16(<4 x s16>) = G_TRUNC %inlo
858   //   %hi16(<4 x s16>) = G_TRUNC %inhi
859   //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
860   //   %res(<8 x s8>) = G_TRUNC %in16
861 
862   Register DstReg = MI.getOperand(0).getReg();
863   Register SrcReg = MI.getOperand(1).getReg();
864   LLT DstTy = MRI.getType(DstReg);
865   LLT SrcTy = MRI.getType(SrcReg);
866   assert(isPowerOf2_32(DstTy.getSizeInBits()) &&
867          isPowerOf2_32(SrcTy.getSizeInBits()));
868 
869   // Split input type.
870   LLT SplitSrcTy =
871       SrcTy.changeElementCount(SrcTy.getElementCount().divideCoefficientBy(2));
872   // First, split the source into two smaller vectors.
873   SmallVector<Register, 2> SplitSrcs;
874   extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs);
875 
876   // Truncate the splits into intermediate narrower elements.
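  // The intermediate elements are twice the destination width; the original
  // G_TRUNC, retargeted below, performs the final narrowing step.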
877   LLT InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
878   for (unsigned I = 0; I < SplitSrcs.size(); ++I)
879     SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);
880 
881   auto Concat = MIRBuilder.buildConcatVectors(
882       DstTy.changeElementSize(DstTy.getScalarSizeInBits() * 2), SplitSrcs);
883 
884   Helper.Observer.changingInstr(MI);
885   MI.getOperand(1).setReg(Concat.getReg(0));
886   Helper.Observer.changedInstr(MI);
887   return true;
888 }
889 
890 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
891     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
892     GISelChangeObserver &Observer) const {
893   assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
894   // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
895   // G_ADD_LOW instructions.
896   // By splitting this here, we can optimize accesses in the small code model by
897   // folding in the G_ADD_LOW into the load/store offset.
898   auto &GlobalOp = MI.getOperand(1);
  const auto *GV = GlobalOp.getGlobal();
900   if (GV->isThreadLocal())
901     return true; // Don't want to modify TLS vars.
902 
903   auto &TM = ST->getTargetLowering()->getTargetMachine();
904   unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
905 
906   if (OpFlags & AArch64II::MO_GOT)
907     return true;
908 
909   auto Offset = GlobalOp.getOffset();
910   Register DstReg = MI.getOperand(0).getReg();
911   auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
912                   .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
913   // Set the regclass on the dest reg too.
914   MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
915 
916   // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
917   // by creating a MOVK that sets bits 48-63 of the register to (global address
918   // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
  // prevent an incorrect tag being generated during relocation when the
920   // global appears before the code section. Without the offset, a global at
921   // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
922   // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
923   // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
924   // instead of `0xf`.
925   // This assumes that we're in the small code model so we can assume a binary
926   // size of <= 4GB, which makes the untagged PC relative offset positive. The
927   // binary must also be loaded into address range [0, 2^48). Both of these
928   // properties need to be ensured at runtime when using tagged addresses.
929   if (OpFlags & AArch64II::MO_TAGGED) {
930     assert(!Offset &&
931            "Should not have folded in an offset for a tagged global!");
932     ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
933                .addGlobalAddress(GV, 0x100000000,
934                                  AArch64II::MO_PREL | AArch64II::MO_G3)
935                .addImm(48);
936     MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
937   }
938 
939   MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
940       .addGlobalAddress(GV, Offset,
941                         OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
942   MI.eraseFromParent();
943   return true;
944 }
945 
946 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
947                                              MachineInstr &MI) const {
948   return true;
949 }
950 
951 bool AArch64LegalizerInfo::legalizeShlAshrLshr(
952     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
953     GISelChangeObserver &Observer) const {
954   assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
955          MI.getOpcode() == TargetOpcode::G_LSHR ||
956          MI.getOpcode() == TargetOpcode::G_SHL);
957   // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
958   // imported patterns can select it later. Either way, it will be legal.
959   Register AmtReg = MI.getOperand(2).getReg();
960   auto VRegAndVal = getConstantVRegValWithLookThrough(AmtReg, MRI);
961   if (!VRegAndVal)
962     return true;
963   // Check the shift amount is in range for an immediate form.
964   int64_t Amount = VRegAndVal->Value.getSExtValue();
965   if (Amount > 31)
966     return true; // This will have to remain a register variant.
967   auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
968   Observer.changingInstr(MI);
969   MI.getOperand(2).setReg(ExtCst.getReg(0));
970   Observer.changedInstr(MI);
971   return true;
972 }
973 
974 // FIXME: This should be removed and replaced with the generic bitcast legalize
975 // action.
976 bool AArch64LegalizerInfo::legalizeLoadStore(
977     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
978     GISelChangeObserver &Observer) const {
979   assert(MI.getOpcode() == TargetOpcode::G_STORE ||
980          MI.getOpcode() == TargetOpcode::G_LOAD);
981   // Here we just try to handle vector loads/stores where our value type might
982   // have pointer elements, which the SelectionDAG importer can't handle. To
983   // allow the existing patterns for s64 to fire for p0, we just try to bitcast
984   // the value to use s64 types.
985 
  // Custom legalization requires that the instruction, if not deleted, be
  // fully legalized. To allow further legalization of the instruction, we
  // create a new instruction and erase the existing one.
989 
990   Register ValReg = MI.getOperand(0).getReg();
991   const LLT ValTy = MRI.getType(ValReg);
992 
993   if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
994       ValTy.getElementType().getAddressSpace() != 0) {
995     LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
996     return false;
997   }
998 
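  // Rewrite the vector-of-pointers type as a vector of same-sized integers
  // and update the memory operand's type to match.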
999   unsigned PtrSize = ValTy.getElementType().getSizeInBits();
1000   const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
1001   auto &MMO = **MI.memoperands_begin();
1002   MMO.setType(NewTy);
1003 
1004   if (MI.getOpcode() == TargetOpcode::G_STORE) {
1005     auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
1006     MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
1007   } else {
1008     auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
1009     MIRBuilder.buildBitcast(ValReg, NewLoad);
1010   }
1011   MI.eraseFromParent();
1012   return true;
1013 }
1014 
1015 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
1016                                          MachineRegisterInfo &MRI,
1017                                          MachineIRBuilder &MIRBuilder) const {
1018   MachineFunction &MF = MIRBuilder.getMF();
1019   Align Alignment(MI.getOperand(2).getImm());
1020   Register Dst = MI.getOperand(0).getReg();
1021   Register ListPtr = MI.getOperand(1).getReg();
1022 
1023   LLT PtrTy = MRI.getType(ListPtr);
1024   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1025 
1026   const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
1027   const Align PtrAlign = Align(PtrSize);
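  // Load the current head pointer of the va_list.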
1028   auto List = MIRBuilder.buildLoad(
1029       PtrTy, ListPtr,
1030       *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1031                                PtrTy, PtrAlign));
1032 
1033   MachineInstrBuilder DstPtr;
1034   if (Alignment > PtrAlign) {
1035     // Realign the list to the actual required alignment.
1036     auto AlignMinus1 =
1037         MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
1038     auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
1039     DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
1040   } else
1041     DstPtr = List;
1042 
1043   LLT ValTy = MRI.getType(Dst);
1044   uint64_t ValSize = ValTy.getSizeInBits() / 8;
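  // Load the argument itself from the (possibly realigned) slot.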
1045   MIRBuilder.buildLoad(
1046       Dst, DstPtr,
1047       *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1048                                ValTy, std::max(Alignment, PtrAlign)));
1049 
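  // Advance the va_list pointer past this argument, rounding its size up to
  // the slot alignment, and store the new head back.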
1050   auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
1051 
1052   auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
1053 
1054   MIRBuilder.buildStore(NewList, ListPtr,
1055                         *MF.getMachineMemOperand(MachinePointerInfo(),
1056                                                  MachineMemOperand::MOStore,
1057                                                  PtrTy, PtrAlign));
1058 
1059   MI.eraseFromParent();
1060   return true;
1061 }
1062 
1063 bool AArch64LegalizerInfo::legalizeBitfieldExtract(
1064     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
1065   // Only legal if we can select immediate forms.
1066   // TODO: Lower this otherwise.
1067   return getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
1068          getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
1069 }
1070 
1071 bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
1072                                          MachineRegisterInfo &MRI,
1073                                          LegalizerHelper &Helper) const {
  // There is no integer popcount instruction, but CTPOP can be lowered more
  // efficiently to the following sequence that uses AdvSIMD
  // registers/instructions, as long as the copies to/from the AdvSIMD
  // registers are cheap.
1078   //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
1079   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
1080   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
1081   //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
1082   //
1083   // For 128 bit vector popcounts, we lower to the following sequence:
1084   //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
1085   //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
1086   //  uaddlp.4s v0, v0  //        v4s32, v2s64
1087   //  uaddlp.2d v0, v0  //               v2s64
1088   //
1089   // For 64 bit vector popcounts, we lower to the following sequence:
1090   //  cnt.8b    v0, v0  // v4s16, v2s32
1091   //  uaddlp.4h v0, v0  // v4s16, v2s32
1092   //  uaddlp.2s v0, v0  //        v2s32
1093 
1094   if (!ST->hasNEON() ||
1095       MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat))
1096     return false;
1097   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1098   Register Dst = MI.getOperand(0).getReg();
1099   Register Val = MI.getOperand(1).getReg();
1100   LLT Ty = MRI.getType(Val);
1101 
1102   assert(Ty == MRI.getType(Dst) &&
1103          "Expected src and dst to have the same type!");
1104   unsigned Size = Ty.getSizeInBits();
1105 
1106   // Pre-conditioning: widen Val up to the nearest vector type.
  // s32, s64, v4s16, v2s32 -> v8s8
  // v8s16, v4s32, v2s64 -> v16s8
1109   LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
1110   if (Ty.isScalar()) {
1111     // TODO: Handle s128.
1112     assert((Size == 32 || Size == 64) && "Expected only 32 or 64 bit scalars!");
1113     if (Size == 32) {
1114       Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
1115     }
1116   }
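  // View the (possibly widened) value as a vector of bytes so the G_CTPOP
  // below counts bits per byte lane.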
1117   Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);
1118 
1119   // Count bits in each byte-sized lane.
1120   auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);
1121 
1122   // Sum across lanes.
1123   Register HSum = CTPOP.getReg(0);
1124   unsigned Opc;
1125   SmallVector<LLT> HAddTys;
1126   if (Ty.isScalar()) {
1127     Opc = Intrinsic::aarch64_neon_uaddlv;
1128     HAddTys.push_back(LLT::scalar(32));
1129   } else if (Ty == LLT::fixed_vector(8, 16)) {
1130     Opc = Intrinsic::aarch64_neon_uaddlp;
1131     HAddTys.push_back(LLT::fixed_vector(8, 16));
1132   } else if (Ty == LLT::fixed_vector(4, 32)) {
1133     Opc = Intrinsic::aarch64_neon_uaddlp;
1134     HAddTys.push_back(LLT::fixed_vector(8, 16));
1135     HAddTys.push_back(LLT::fixed_vector(4, 32));
1136   } else if (Ty == LLT::fixed_vector(2, 64)) {
1137     Opc = Intrinsic::aarch64_neon_uaddlp;
1138     HAddTys.push_back(LLT::fixed_vector(8, 16));
1139     HAddTys.push_back(LLT::fixed_vector(4, 32));
1140     HAddTys.push_back(LLT::fixed_vector(2, 64));
1141   } else if (Ty == LLT::fixed_vector(4, 16)) {
1142     Opc = Intrinsic::aarch64_neon_uaddlp;
1143     HAddTys.push_back(LLT::fixed_vector(4, 16));
1144   } else if (Ty == LLT::fixed_vector(2, 32)) {
1145     Opc = Intrinsic::aarch64_neon_uaddlp;
1146     HAddTys.push_back(LLT::fixed_vector(4, 16));
1147     HAddTys.push_back(LLT::fixed_vector(2, 32));
1148   } else
1149     llvm_unreachable("unexpected vector shape");
1150   MachineInstrBuilder UADD;
1151   for (LLT HTy : HAddTys) {
1152     UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}, /*HasSideEffects =*/false)
1153                      .addUse(HSum);
1154     HSum = UADD.getReg(0);
1155   }
1156 
1157   // Post-conditioning.
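  // The scalar path used UADDLV, whose result here is s32; zero-extend it
  // when the original type was s64, otherwise just redirect the def to Dst.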
1158   if (Ty.isScalar() && Size == 64)
1159     MIRBuilder.buildZExt(Dst, UADD);
1160   else
1161     UADD->getOperand(0).setReg(Dst);
1162   MI.eraseFromParent();
1163   return true;
1164 }
1165 
1166 bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
1167     MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
1168   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1169   LLT s64 = LLT::scalar(64);
1170   auto Addr = MI.getOperand(1).getReg();
1171   auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
1172   auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
1173   auto DstLo = MRI.createGenericVirtualRegister(s64);
1174   auto DstHi = MRI.createGenericVirtualRegister(s64);
1175 
1176   MachineInstrBuilder CAS;
1177   if (ST->hasLSE()) {
1178     // We have 128-bit CASP instructions taking XSeqPair registers, which are
1179     // s128. We need the merge/unmerge to bracket the expansion and pair up with
1180     // the rest of the MIR so we must reassemble the extracted registers into a
1181     // 128-bit known-regclass one with code like this:
1182     //
1183     //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
1184     //     %out = CASP %in1, ...
1185     //     %OldLo = G_EXTRACT %out, 0
1186     //     %OldHi = G_EXTRACT %out, 64
1187     auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
1188     unsigned Opcode;
1189     switch (Ordering) {
1190     case AtomicOrdering::Acquire:
1191       Opcode = AArch64::CASPAX;
1192       break;
1193     case AtomicOrdering::Release:
1194       Opcode = AArch64::CASPLX;
1195       break;
1196     case AtomicOrdering::AcquireRelease:
1197     case AtomicOrdering::SequentiallyConsistent:
1198       Opcode = AArch64::CASPALX;
1199       break;
1200     default:
1201       Opcode = AArch64::CASPX;
1202       break;
1203     }
1204 
1205     LLT s128 = LLT::scalar(128);
1206     auto CASDst = MRI.createGenericVirtualRegister(s128);
1207     auto CASDesired = MRI.createGenericVirtualRegister(s128);
1208     auto CASNew = MRI.createGenericVirtualRegister(s128);
1209     MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
1210         .addUse(DesiredI->getOperand(0).getReg())
1211         .addImm(AArch64::sube64)
1212         .addUse(DesiredI->getOperand(1).getReg())
1213         .addImm(AArch64::subo64);
1214     MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
1215         .addUse(NewI->getOperand(0).getReg())
1216         .addImm(AArch64::sube64)
1217         .addUse(NewI->getOperand(1).getReg())
1218         .addImm(AArch64::subo64);
1219 
1220     CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});
1221 
1222     MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
1223     MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
1224   } else {
    // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
    // can take arbitrary registers, so it just has the normal GPR64 operands
    // that the rest of AArch64 is expecting.
1228     auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
1229     unsigned Opcode;
1230     switch (Ordering) {
1231     case AtomicOrdering::Acquire:
1232       Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
1233       break;
1234     case AtomicOrdering::Release:
1235       Opcode = AArch64::CMP_SWAP_128_RELEASE;
1236       break;
1237     case AtomicOrdering::AcquireRelease:
1238     case AtomicOrdering::SequentiallyConsistent:
1239       Opcode = AArch64::CMP_SWAP_128;
1240       break;
1241     default:
1242       Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
1243       break;
1244     }
1245 
1246     auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1247     CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
1248                                 {Addr, DesiredI->getOperand(0),
1249                                  DesiredI->getOperand(1), NewI->getOperand(0),
1250                                  NewI->getOperand(1)});
1251   }
1252 
1253   CAS.cloneMemRefs(MI);
1254   constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
1255                                    *MRI.getTargetRegisterInfo(),
1256                                    *ST->getRegBankInfo());
1257 
1258   MIRBuilder.buildMerge(MI.getOperand(0), {DstLo, DstHi});
1259   MI.eraseFromParent();
1260   return true;
1261 }
1262 
1263 bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
1264                                         LegalizerHelper &Helper) const {
1265   MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1266   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1267   LLT Ty = MRI.getType(MI.getOperand(1).getReg());
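  // CTTZ(x) == CTLZ(bitreverse(x)); both G_BITREVERSE and G_CTLZ are legal
  // for s32/s64 here (RBIT + CLZ), so emit that pair.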
1268   auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
1269   MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
1270   MI.eraseFromParent();
1271   return true;
1272 }
1273