1 /*
2  * Copyright 2011 Christoph Bumiller
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_build_util.h"
25 
26 #include "codegen/nv50_ir_target_nv50.h"
27 
// Byte offsets of the per-surface parameters the driver uploads into its
// auxiliary constant buffer (read back via loadSuInfo/loadSuInfo16 below).
#define NV50_SU_INFO_SIZE_X   0x00
#define NV50_SU_INFO_SIZE_Y   0x04
#define NV50_SU_INFO_SIZE_Z   0x08
#define NV50_SU_INFO_BSIZE    0x0c
#define NV50_SU_INFO_STRIDE_Y 0x10
#define NV50_SU_INFO_MS_X     0x18
#define NV50_SU_INFO_MS_Y     0x1c
#define NV50_SU_INFO_TILE_SHIFT_X 0x20
#define NV50_SU_INFO_TILE_SHIFT_Y 0x24
#define NV50_SU_INFO_TILE_SHIFT_Z 0x28
#define NV50_SU_INFO_OFFSET_Z 0x2c

// Size in bytes of one surface's info block; slots are laid out back-to-back.
#define NV50_SU_INFO__STRIDE 0x30

// Indexed accessors for the per-dimension entries above (i = 0, 1, 2).
#define NV50_SU_INFO_SIZE(i) (0x00 + (i) * 4)
#define NV50_SU_INFO_MS(i)   (0x18 + (i) * 4)
#define NV50_SU_INFO_TILE_SHIFT(i) (0x20 + (i) * 4)
45 
46 namespace nv50_ir {
47 
48 // nv50 doesn't support 32 bit integer multiplication
49 //
50 //       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
51 // -------------------
52 //    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
53 // ah*bh 00 00                 (           carry1) << 16 + ( carry2)
54 //       al*bl
55 //    ah*bl 00
56 //
57 // fffe0001 + fffe0001
58 //
59 // Note that this sort of splitting doesn't work for signed values, so we
60 // compute the sign on those manually and then perform an unsigned multiply.
// Expand a full-width integer multiply (or its MUL_HIGH variant) into the
// half-width multiplies the hardware supports, following the schoolbook
// scheme described above.  Returns false when the source type cannot be
// split into halves (i.e. is not a 32- or 64-bit integer type).
static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
   ImmediateValue src1;
   bool src1imm = mul->src(1).getImmediate(src1);

   // Work on unsigned types; the sign of a signed MUL_HIGH is handled
   // explicitly at the end.
   DataType fTy; // full type
   switch (mul->sType) {
   case TYPE_S32: fTy = TYPE_U32; break;
   case TYPE_S64: fTy = TYPE_U64; break;
   default: fTy = mul->sType; break;
   }

   DataType hTy; // half type
   switch (fTy) {
   case TYPE_U32: hTy = TYPE_U16; break;
   case TYPE_U64: hTy = TYPE_U32; break;
   default:
      return false; // cannot split other types
   }
   unsigned int fullSize = typeSizeof(fTy);
   unsigned int halfSize = typeSizeof(hTy);

   // Emitted instructions; some entries alias when a partial product can be
   // skipped for an immediate src1.  Entries 2..5 get their sType fixed up
   // to the half type at the end.
   Instruction *i[9];

   bld->setPosition(mul, true);

   Value *s[2];        // (possibly absolute) sources
   Value *a[2], *b[2]; // lo/hi halves of s[0] and s[1]
   Value *t[4];        // intermediate products / sums
   for (int j = 0; j < 4; ++j)
      t[j] = bld->getSSA(fullSize);

   // For signed MUL_HIGH compute |s0| * |s1| and fix up the sign afterwards
   // (the half-splitting scheme is only valid for unsigned values).
   if (isSignedType(mul->sType) && highResult) {
      s[0] = bld->getSSA(fullSize);
      s[1] = bld->getSSA(fullSize);
      bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
      bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
      // keep the immediate copy in sync (only meaningful when src1imm)
      src1.reg.data.s32 = abs(src1.reg.data.s32);
   } else {
      s[0] = mul->getSrc(0);
      s[1] = mul->getSrc(1);
   }

   // split sources into halves
   i[0] = bld->mkSplit(a, halfSize, s[0]);
   i[1] = bld->mkSplit(b, halfSize, s[1]);

   // Partial products.  When src1 is an immediate with a zero half, the
   // corresponding multiply is skipped and the i[]/t[] entries are aliased.
   if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
      i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
                               bld->mkImm(src1.reg.data.u32 & 0xffff));
   } else {
      i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
                        src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
      if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
         i[3] = i[2];
         t[1] = t[0];
      } else {
         i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
      }
   }
   // t[3] = (a.lo*b.hi + a.hi*b.lo) << halfBits + a.lo*b.lo  (the LO result)
   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
   if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
      i[4] = i[3];
      t[3] = t[2];
   } else {
      i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
   }

   if (highResult) {
      Value *c[2]; // carry flags from i[3] and i[4]
      Value *r[5];
      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
      c[0] = bld->getSSA(1, FILE_FLAGS);
      c[1] = bld->getSSA(1, FILE_FLAGS);
      for (int j = 0; j < 5; ++j)
         r[j] = bld->getSSA(fullSize);

      // r[4] = a.hi*b.hi + ((a.lo*b.hi + a.hi*b.lo) >> halfBits)
      //        + carry1 << halfBits + carry2   (see the comment up top)
      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
      bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
      i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);

      // set carry defs / sources
      i[3]->setFlagsDef(1, c[0]);
      // actual result required in negative case, but ignored for
      // unsigned. for some reason the compiler ends up dropping the whole
      // instruction if the destination is unused but the flags are.
      if (isSignedType(mul->sType))
         i[4]->setFlagsDef(1, c[1]);
      else
         i[4]->setFlagsDef(0, c[1]);
      i[6]->setPredicate(CC_C, c[0]);
      i[5]->setFlagsSrc(3, c[1]);

      if (isSignedType(mul->sType)) {
         Value *cc[2];
         Value *rr[7];
         Value *one = bld->getSSA(fullSize);
         bld->loadImm(one, 1);
         for (int j = 0; j < 7; j++)
            rr[j] = bld->getSSA(fullSize);

         // NOTE: this logic uses predicates because splitting basic blocks is
         // ~impossible during the SSA phase. The RA relies on a correlation
         // between edge order and phi node sources.

         // Set the sign of the result based on the inputs
         bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
            ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));

         // 1s complement of 64-bit value
         bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
            ->setPredicate(CC_S, cc[0]);
         bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
            ->setPredicate(CC_S, cc[0]);

         // add to low 32-bits, keep track of the carry
         Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
         n->setPredicate(CC_S, cc[0]);
         n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));

         // If there was a carry, add 1 to the upper 32 bits
         // XXX: These get executed even if they shouldn't be
         bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
            ->setPredicate(CC_C, cc[1]);
         bld->mkMov(rr[3], rr[0])
            ->setPredicate(CC_NC, cc[1]);
         bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);

         // Merge the results from the negative and non-negative paths
         bld->mkMov(rr[5], rr[4])
            ->setPredicate(CC_S, cc[0]);
         bld->mkMov(rr[6], r[4])
            ->setPredicate(CC_NS, cc[0]);
         bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
      } else {
         bld->mkMov(mul->getDef(0), r[4]);
      }
   } else {
      bld->mkMov(mul->getDef(0), t[3]);
   }
   delete_Instruction(bld->getProgram(), mul);

   // The multiplies/MADs actually operate on half-sized operands.
   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
      if (i[j])
         i[j]->sType = hTy;

   return true;
}
213 
// Per-lane quad operation selectors, 2 bits each; QUADOP packs one selector
// for each lane of a quad (upper-left, upper-right, lower-left, lower-right).
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

//             UL UR LL LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))
223 
// Post-RA legalization for nv50: removes no-ops, emulates PRERET on
// pre-NVA0 chips, splits 64-bit operations, replaces immediate-zero
// sources, and replays the output writes queued by NV50LegalizeSSA.
class NV50LegalizePostRA : public Pass
{
public:
   NV50LegalizePostRA() : r63(NULL) { }

private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   // Emulate OP_PRERET with a branch/call sequence (chipsets < 0xa0).
   void handlePRERET(FlowInstruction *);
   // Replace immediate 0 sources of an instruction with r63.
   void replaceZero(Instruction *);

   BuildUtil bld;

   // GPR used in place of immediate zeros: $r63, or $r127 when the program
   // already uses the high registers (see visit(Function *)).
   LValue *r63;
};
240 
241 bool
visit(Function * fn)242 NV50LegalizePostRA::visit(Function *fn)
243 {
244    Program *prog = fn->getProgram();
245 
246    r63 = new_LValue(fn, FILE_GPR);
247    // GPR units on nv50 are in half-regs
248    if (prog->maxGPR < 126)
249       r63->reg.data.id = 63;
250    else
251       r63->reg.data.id = 127;
252 
253    // this is actually per-program, but we can do it all on visiting main()
254    std::list<Instruction *> *outWrites =
255       reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
256 
257    if (outWrites) {
258       for (std::list<Instruction *>::iterator it = outWrites->begin();
259            it != outWrites->end(); ++it)
260          (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
261       // instructions will be deleted on exit
262       outWrites->clear();
263    }
264 
265    return true;
266 }
267 
268 void
replaceZero(Instruction * i)269 NV50LegalizePostRA::replaceZero(Instruction *i)
270 {
271    for (int s = 0; i->srcExists(s); ++s) {
272       ImmediateValue *imm = i->getSrc(s)->asImm();
273       if (imm && imm->reg.data.u64 == 0)
274          i->setSrc(s, r63);
275    }
276 }
277 
278 // Emulate PRERET: jump to the target and call to the origin from there
279 //
280 // WARNING: atm only works if BBs are affected by at most a single PRERET
281 //
282 // BB:0
283 // preret BB:3
284 // (...)
285 // BB:3
286 // (...)
287 //             --->
288 // BB:0
289 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
290 // (...)
291 // BB:3
292 // bra BB:3 + n1 (skip the call)
293 // call BB:0 + n2 (skip bra at beginning of BB:0)
294 // (...)
void
NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
{
   BasicBlock *bbE = pre->bb;        // block containing the PRERET
   BasicBlock *bbT = pre->target.bb; // PRERET target block

   // Repurpose the PRERET as the "bra BB:3 + n0" and pin it to the start
   // of its block (see the diagram above).
   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
   bbE->remove(pre);
   bbE->insertHead(pre);

   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);

   // insertHead prepends, so after both inserts bbT starts with
   // "skip" followed by "call".
   bbT->insertHead(call);
   bbT->insertHead(skip);

   // NOTE: maybe split blocks to prevent the instructions from moving ?

   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1; // "bra BB:3 + n1" (skip the call)
   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2; // "call BB:0 + n2"
}
316 
317 bool
visit(BasicBlock * bb)318 NV50LegalizePostRA::visit(BasicBlock *bb)
319 {
320    Instruction *i, *next;
321 
322    // remove pseudo operations and non-fixed no-ops, split 64 bit operations
323    for (i = bb->getFirst(); i; i = next) {
324       next = i->next;
325       if (i->isNop()) {
326          bb->remove(i);
327       } else
328       if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
329          handlePRERET(i->asFlow());
330       } else {
331          // TODO: We will want to do this before register allocation,
332          // since have to use a $c register for the carry flag.
333          if (typeSizeof(i->dType) == 8) {
334             Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
335             if (hi)
336                next = hi;
337          }
338 
339          if (i->op != OP_PFETCH && i->op != OP_BAR &&
340              (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
341             replaceZero(i);
342       }
343    }
344    if (!bb->getEntry())
345       return true;
346 
347    return true;
348 }
349 
// SSA-phase legalization for nv50: expands integer MUL/MAD/DIV/MOD into
// supported sequences, legalizes address-register defs and uses, and queues
// output writes for propagation after register allocation.
class NV50LegalizeSSA : public Pass
{
public:
   NV50LegalizeSSA(Program *);

   virtual bool visit(BasicBlock *bb);

private:
   // queue an EXPORT whose stored value's def can write the output directly
   void propagateWriteToOutput(Instruction *);
   // emulate 32-bit integer division using f32 reciprocal
   void handleDIV(Instruction *);
   // emulate MOD as a - (a / b) * b
   void handleMOD(Instruction *);
   // expand wide integer MUL/MAD via expandIntegerMUL
   void handleMUL(Instruction *);
   // legalize an instruction defining an address register ($a)
   void handleAddrDef(Instruction *);

   // true if the instruction is SHL(GPR, 0), i.e. a plain GPR -> $a copy
   inline bool isARL(const Instruction *) const;

   BuildUtil bld;

   // saved output writes (lives in prog->targetPriv); NULL when disabled
   std::list<Instruction *> *outWrites;
};
370 
NV50LegalizeSSA(Program * prog)371 NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
372 {
373    bld.setProgram(prog);
374 
375    if (prog->optLevel >= 2 &&
376        (prog->getType() == Program::TYPE_GEOMETRY ||
377         prog->getType() == Program::TYPE_VERTEX))
378       outWrites =
379          reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
380    else
381       outWrites = NULL;
382 }
383 
// If the exported value has exactly one use, try to make its defining
// instruction write the output directly.  The EXPORT itself is removed from
// the block and queued in outWrites; the def rewrite happens after register
// allocation (see NV50LegalizePostRA::visit(Function *)).
void
NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
{
   // only direct stores of singly-referenced values are candidates
   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
      return;

   // check def instruction can store
   Instruction *di = st->getSrc(1)->defs.front()->getInsn();

   // TODO: move exports (if beneficial) in common opt pass
   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
      return;

   // bail if the def reads immediates or local memory — presumably those
   // operands can't be combined with a direct output write; TODO confirm
   for (int s = 0; di->srcExists(s); ++s)
      if (di->src(s).getFile() == FILE_IMMEDIATE ||
          di->src(s).getFile() == FILE_MEMORY_LOCAL)
         return;

   if (prog->getType() == Program::TYPE_GEOMETRY) {
      // Only propagate output writes in geometry shaders when we can be sure
      // that we are propagating to the same output vertex.
      if (di->bb != st->bb)
         return;
      Instruction *i;
      for (i = di; i != st; i = i->next) {
         if (i->op == OP_EMIT || i->op == OP_RESTART)
            return;
      }
      assert(i); // st after di
   }

   // We cannot set defs to non-lvalues before register allocation, so
   // save & remove (to save registers) the exports and replace later.
   outWrites->push_back(st);
   st->bb->remove(st);
}
420 
421 bool
isARL(const Instruction * i) const422 NV50LegalizeSSA::isARL(const Instruction *i) const
423 {
424    ImmediateValue imm;
425 
426    if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
427       return false;
428    if (!i->src(1).getImmediate(imm))
429       return false;
430    return imm.isInteger(0);
431 }
432 
// Legalize an instruction that defines an address register: only
// ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) may write $a, and $a
// cannot be used as a general source operand.
void
NV50LegalizeSSA::handleAddrDef(Instruction *i)
{
   Instruction *arl;

   i->getDef(0)->reg.size = 2; // $aX are only 16 bit

   // PFETCH can always write to $a
   if (i->op == OP_PFETCH)
      return;
   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
         return;
      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
         return;
   }

   // turn $a sources into $r sources (can't operate on $a)
   for (int s = 0; i->srcExists(s); ++s) {
      Value *a = i->getSrc(s);
      Value *r;
      if (a->reg.file == FILE_ADDRESS) {
         if (a->getInsn() && isARL(a->getInsn())) {
            // the $a value is a plain GPR copy: use the GPR directly
            i->setSrc(s, a->getInsn()->getSrc(0));
         } else {
            // otherwise copy $a back into a fresh GPR first
            bld.setPosition(i, false);
            r = bld.getSSA();
            bld.mkMov(r, a);
            i->setSrc(s, r);
         }
      }
   }
   // after source replacement a SHL(GPR, IMM) is already legal
   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
      return;

   // turn result back into $a:
   // make i define a GPR and append an ARL (SHL by 0) copying it into $a
   bld.setPosition(i, true);
   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
   i->setDef(0, arl->getSrc(0));
}
474 
// Expand integer MUL/MAD of 32 bits or wider: a MAD is first split into
// MUL + ADD, then the MUL is expanded via expandIntegerMUL.  Any predicate
// is taken off while expanding and reapplied to the instruction that ends
// up defining the original result.
void
NV50LegalizeSSA::handleMUL(Instruction *mul)
{
   // nothing to do for floats or <= 16-bit integers
   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
      return;
   Value *def = mul->getDef(0);
   Value *pred = mul->getPredicate();
   CondCode cc = mul->cc;
   if (pred)
      mul->setPredicate(CC_ALWAYS, NULL);

   if (mul->op == OP_MAD) {
      // reuse the MAD instruction as the final ADD and insert a new MUL
      // in front of it
      Instruction *add = mul;
      bld.setPosition(add, false);
      Value *res = cloneShallow(func, mul->getDef(0));
      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
      add->op = OP_ADD;
      add->setSrc(0, mul->getDef(0));
      add->setSrc(1, add->getSrc(2));
      for (int s = 2; add->srcExists(s); ++s)
         add->setSrc(s, NULL);
      // the MUL inherits the subOp (e.g. MUL_HIGH), the ADD is plain
      mul->subOp = add->subOp;
      add->subOp = 0;
   }
   expandIntegerMUL(&bld, mul);
   if (pred)
      def->getInsn()->setPredicate(cc, pred);
}
503 
504 // Use f32 division: first compute an approximate result, use it to reduce
505 // the dividend, which should then be representable as f32, divide the reduced
506 // dividend, and add the quotients.
void
NV50LegalizeSSA::handleDIV(Instruction *div)
{
   const DataType ty = div->sType;

   // only 32-bit integer division is emulated here
   if (ty != TYPE_U32 && ty != TYPE_S32)
      return;

   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;

   bld.setPosition(div, false);

   // f32 copies of the operands (absolute values for signed division; the
   // sign is reapplied at the end)
   Value *a, *af = bld.getSSA();
   Value *b, *bf = bld.getSSA();

   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));

   if (isSignedType(ty)) {
      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      a = bld.getSSA();
      b = bld.getSSA();
      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
   } else {
      a = div->getSrc(0);
      b = div->getSrc(1);
   }

   // approximate 1/b; the integer ADD of -2 tweaks the f32 bit pattern —
   // presumably to bias the reciprocal downward so the quotient estimate
   // never overshoots; TODO confirm
   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));

   // first quotient estimate: q0 = trunc(a * (1/b))
   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;

   // get error of 1st result
   expandIntegerMUL(&bld,
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);

   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);

   // divide the reduced dividend and add the quotients
   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
      ->rnd = ROUND_Z;
   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients

   // correction: if modulus >= divisor, add 1
   // (q - s: presumably OP_SET produces ~0 for true, making this q + 1;
   // TODO confirm)
   expandIntegerMUL(&bld,
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
   if (!isSignedType(ty)) {
      div->op = OP_SUB;
      div->setSrc(0, q);
      div->setSrc(1, s);
   } else {
      t = q;
      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
      s = bld.getSSA();
      t = bld.getSSA();
      // fix the sign: the result is negative iff the operand signs differ
      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);

      // the original DIV becomes the UNION of the two predicated paths
      div->op = OP_UNION;
      div->setSrc(0, s);
      div->setSrc(1, t);
   }
}
580 
// Emulate 32-bit integer MOD as a - (a / b) * b, reusing the DIV and MUL
// lowerings above.
void
NV50LegalizeSSA::handleMOD(Instruction *mod)
{
   if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
      return;
   bld.setPosition(mod, false);

   Value *q = bld.getSSA(); // quotient a / b
   Value *m = bld.getSSA(); // q * b

   bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
   handleDIV(q->getInsn());

   bld.setPosition(mod, false);
   expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));

   // turn the MOD itself into the final subtraction a - m
   mod->op = OP_SUB;
   mod->setSrc(1, m);
}
600 
601 bool
visit(BasicBlock * bb)602 NV50LegalizeSSA::visit(BasicBlock *bb)
603 {
604    Instruction *insn, *next;
605    // skipping PHIs (don't pass them to handleAddrDef) !
606    for (insn = bb->getEntry(); insn; insn = next) {
607       next = insn->next;
608 
609       if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
610          handleAddrDef(insn);
611 
612       switch (insn->op) {
613       case OP_EXPORT:
614          if (outWrites)
615             propagateWriteToOutput(insn);
616          break;
617       case OP_DIV:
618          handleDIV(insn);
619          break;
620       case OP_MOD:
621          handleMOD(insn);
622          break;
623       case OP_MAD:
624       case OP_MUL:
625          handleMUL(insn);
626          break;
627       default:
628          break;
629       }
630    }
631    return true;
632 }
633 
// Pre-SSA lowering for nv50: rewrites generic IR operations (texturing,
// surfaces, system values, math ops, flow) into sequences the nv50 target
// can encode.
class NV50LoweringPreSSA : public Pass
{
public:
   NV50LoweringPreSSA(Program *);

private:
   virtual bool visit(Instruction *);
   virtual bool visit(Function *);

   // system value reads / writes
   bool handleRDSV(Instruction *);
   bool handleWRSV(Instruction *);

   bool handlePFETCH(Instruction *);
   bool handleEXPORT(Instruction *);
   bool handleLOAD(Instruction *);
   bool handleLDST(Instruction *);
   bool handleMEMBAR(Instruction *);
   bool handleSharedATOM(Instruction *);
   // surface load / reduction / store
   bool handleSULDP(TexInstruction *);
   bool handleSUREDP(TexInstruction *);
   bool handleSUSTP(TexInstruction *);
   Value *processSurfaceCoords(TexInstruction *);

   bool handleDIV(Instruction *);
   bool handleSQRT(Instruction *);
   bool handlePOW(Instruction *);

   bool handleSET(Instruction *);
   bool handleSLCT(CmpInstruction *);
   bool handleSELP(Instruction *);

   bool handleTEX(TexInstruction *);
   bool handleTXB(TexInstruction *); // I really
   bool handleTXL(TexInstruction *); // hate
   bool handleTXD(TexInstruction *); // these 3
   bool handleTXLQ(TexInstruction *);
   bool handleTXQ(TexInstruction *);
   bool handleSUQ(TexInstruction *);
   bool handleBUFQ(Instruction *);

   bool handleCALL(Instruction *);
   bool handlePRECONT(Instruction *);
   bool handleCONT(Instruction *);

   void checkPredicate(Instruction *);
   // helpers to fetch multisample / surface parameters from the driver's
   // constant buffers
   void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
   void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);
   Value *loadSuInfo(int slot, uint32_t off);
   Value *loadSuInfo16(int slot, uint32_t off);

private:
   const Target *const targ;

   BuildUtil bld;

   // thread id, materialized in $r0 for compute shaders (see visit(Function *))
   Value *tid;
};
691 
// Capture the target and attach the builder to the program; tid is filled
// in lazily by visit(Function *) for compute shaders.
NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
   targ(prog->getTarget()), tid(NULL)
{
   bld.setProgram(prog);
}
697 
698 bool
visit(Function * f)699 NV50LoweringPreSSA::visit(Function *f)
700 {
701    BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
702 
703    if (prog->getType() == Program::TYPE_COMPUTE) {
704       // Add implicit "thread id" argument in $r0 to the function
705       Value *arg = new_LValue(func, FILE_GPR);
706       arg->reg.data.id = 0;
707       f->ins.push_back(arg);
708 
709       bld.setPosition(root, false);
710       tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
711    }
712 
713    return true;
714 }
715 
// Load the per-texture multisample parameters ms_x/ms_y from the auxiliary
// constant buffer (presumably log2 sample-grid dimensions — TODO confirm
// against the state setup), plus their sum in *ms.
void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
                                       Value **ms_x, Value **ms_y) {
   // This loads the texture-indexed ms setting from the constant buffer
   Value *tmp = new_LValue(func, FILE_GPR);
   uint8_t b = prog->driver->io.auxCBSlot;
   off += prog->driver->io.suInfoBase;
   // skip the tables of preceding shader stages; each stage's table is
   // 16 entries of 2 dwords
   if (prog->getType() > Program::TYPE_VERTEX)
      off += 16 * 2 * 4;
   if (prog->getType() > Program::TYPE_GEOMETRY)
      off += 16 * 2 * 4;
   if (prog->getType() > Program::TYPE_FRAGMENT)
      off += 16 * 2 * 4;
   *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                             FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
   *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                             FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
   *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
}
734 
// Look up the per-sample x/y coordinate deltas for the given MS mode (ms)
// and sample id (s) from the MS info constant buffer.
void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {
   // Given a MS level, and a sample id, compute the delta x/y
   uint8_t b = prog->driver->io.msInfoCBSlot;
   // off goes into an address register so the loads can be indirect
   Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR);

   // The required information is at mslevel * 16 * 4 + sample * 8
   // = (mslevel * 8 + sample) * 8
   bld.mkOp2(OP_SHL,
             TYPE_U32,
             off,
             bld.mkOp2v(OP_ADD, TYPE_U32, t,
                        bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),
                        s),
             bld.mkImm(3));
   *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                           FILE_MEMORY_CONST, b, TYPE_U32,
                           prog->driver->io.msInfoBase), off);
   *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                           FILE_MEMORY_CONST, b, TYPE_U32,
                           prog->driver->io.msInfoBase + 4), off);
}
756 
757 Value *
loadSuInfo(int slot,uint32_t off)758 NV50LoweringPreSSA::loadSuInfo(int slot, uint32_t off)
759 {
760    uint8_t b = prog->driver->io.auxCBSlot;
761    off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;
762    return bld.mkLoadv(TYPE_U32, bld.mkSymbol(
763                             FILE_MEMORY_CONST, b, TYPE_U32, off), NULL);
764 }
765 
766 Value *
loadSuInfo16(int slot,uint32_t off)767 NV50LoweringPreSSA::loadSuInfo16(int slot, uint32_t off)
768 {
769    uint8_t b = prog->driver->io.auxCBSlot;
770    off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;
771    return bld.mkLoadv(TYPE_U16, bld.mkSymbol(
772                             FILE_MEMORY_CONST, b, TYPE_U16, off), NULL);
773 }
774 
// Lower a generic TEX to nv50 form: normalize cube coordinates, resolve
// multisample coordinates, order dref before bias/lod, legalize array
// indices, flatten cube arrays to 2D arrays, and fold texel offsets into
// the instruction's immediate fields.
bool
NV50LoweringPreSSA::handleTEX(TexInstruction *i)
{
   const int arg = i->tex.target.getArgCount();
   const int dref = arg;
   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;

   /* Only normalize in the non-explicit derivatives case.
    */
   if (i->tex.target.isCube() && i->op != OP_TXD) {
      Value *src[3], *val;
      int c;
      // divide each coordinate by the component of largest magnitude
      for (c = 0; c < 3; ++c)
         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
      val = bld.getScratch();
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
      for (c = 0; c < 3; ++c) {
         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
                                 i->getSrc(c), val));
      }
   }

   // handle MS, which means looking up the MS params for this texture, and
   // adjusting the input coordinates to point at the right sample.
   if (i->tex.target.isMS()) {
      Value *x = i->getSrc(0);
      Value *y = i->getSrc(1);
      Value *s = i->getSrc(arg - 1); // sample id
      Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
         *ms, *ms_x, *ms_y, *dx, *dy;

      i->tex.target.clearMS();

      loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
      loadMsInfo(ms, s, &dx, &dy);

      // coord = (coord << ms_scale) + per-sample delta
      bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
      bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
      bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
      bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
      i->setSrc(0, tx);
      i->setSrc(1, ty);
      i->setSrc(arg - 1, bld.loadImm(NULL, 0));
   }

   // dref comes before bias/lod
   if (i->tex.target.isShadow())
      if (i->op == OP_TXB || i->op == OP_TXL)
         i->swapSources(dref, lod);

   if (i->tex.target.isArray()) {
      if (i->op != OP_TXF) {
         // array index must be converted to u32, but it's already an integer
         // for TXF
         Value *layer = i->getSrc(arg - 1);
         LValue *src = new_LValue(func, FILE_GPR);
         bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
         // clamp to the maximum layer index
         bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
         i->setSrc(arg - 1, src);
      }
      if (i->tex.target.isCube() && i->srcCount() > 4) {
         std::vector<Value *> acube, a2d;
         int c;

         // use TEXPREP to reduce cube-array coordinates to a 2D-array
         // lookup, then retarget the TEX accordingly
         acube.resize(4);
         for (c = 0; c < 4; ++c)
            acube[c] = i->getSrc(c);
         a2d.resize(4);
         for (c = 0; c < 3; ++c)
            a2d[c] = new_LValue(func, FILE_GPR);
         a2d[3] = NULL;

         bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
                   a2d, acube)->asTex()->tex.mask = 0x7;

         // install the 2D coords and shift the remaining sources down by one
         for (c = 0; c < 3; ++c)
            i->setSrc(c, a2d[c]);
         for (; i->srcExists(c + 1); ++c)
            i->setSrc(c, i->getSrc(c + 1));
         i->setSrc(c, NULL);
         assert(c <= 4);

         i->tex.target = i->tex.target.isShadow() ?
            TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
      }
   }

   // texel offsets are 3 immediate fields in the instruction,
   // nv50 cannot do textureGatherOffsets
   assert(i->tex.useOffsets <= 1);
   if (i->tex.useOffsets) {
      for (int c = 0; c < 3; ++c) {
         ImmediateValue val;
         if (!i->offset[0][c].getImmediate(val))
            assert(!"non-immediate offset");
         i->tex.offset[c] = val.reg.data.u32;
         i->offset[0][c].set(NULL);
      }
   }

   return true;
}
879 
880 // Bias must be equal for all threads of a quad or lod calculation will fail.
881 //
882 // The lanes of a quad are grouped by the bit in the condition register they
883 // have set, which is selected by differing bias values.
884 // Move the input values for TEX into a new register set for each group and
885 // execute TEX only for a specific group.
886 // We always need to use 4 new registers for the inputs/outputs because the
887 // implicitly calculated derivatives must be correct.
888 //
889 // TODO: move to SSA phase so we can easily determine whether bias is constant
bool
NV50LoweringPreSSA::handleTXB(TexInstruction *i)
{
   // condition codes selecting, per clone, the lane group it executes for
   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
   int l, d;

   // We can't actually apply bias *and* do a compare for a cube
   // texture. Since the compare has to be done before the filtering, just
   // drop the bias on the floor.
   if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
      i->op = OP_TEX;
      i->setSrc(3, i->getSrc(4));
      i->setSrc(4, NULL);
      return handleTEX(i);
   }

   handleTEX(i);
   Value *bias = i->getSrc(i->tex.target.getArgCount());
   if (bias->isUniform())
      return true; // bias is quad-uniform, the TEX is fine as-is

   // Build a per-lane group id: bit l is set when this lane's bias matches
   // lane l's (lane 0's bit is the constant 1 seeded below).
   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
                                 bld.loadImm(NULL, 1));
   bld.setPosition(cond, false);

   for (l = 1; l < 4; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *bit = bld.getSSA();
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      Value *imm = bld.loadImm(NULL, (1 << l));
      // compare this lane's bias against lane l's within the quad
      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
      cond->setSrc(l, bit);
   }
   // move the group mask into the flags register for predication
   Value *flags = bld.getScratch(1, FILE_FLAGS);
   bld.setPosition(cond, true);
   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;

   // emit four predicated clones of the TEX, one per group
   Instruction *tex[4];
   for (l = 0; l < 4; ++l) {
      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
      bld.insert(tex[l]);
   }

   // gather the per-group results with predicated moves ...
   Value *res[4][4];
   for (d = 0; i->defExists(d); ++d)
      res[0][d] = tex[0]->getDef(d);
   for (l = 1; l < 4; ++l) {
      for (d = 0; tex[l]->defExists(d); ++d) {
         res[l][d] = cloneShallow(func, res[0][d]);
         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
      }
   }

   // ... and merge them into the original defs
   for (d = 0; i->defExists(d); ++d) {
      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
      for (l = 0; l < 4; ++l)
         dst->setSrc(l, res[l][d]);
   }
   delete_Instruction(prog, i);
   return true;
}
952 
953 // LOD must be equal for all threads of a quad.
954 // Unlike with TXB, here we can just diverge since there's no LOD calculation
955 // that would require all 4 threads' sources to be set up properly.
bool
NV50LoweringPreSSA::handleTXL(TexInstruction *i)
{
   handleTEX(i);
   Value *lod = i->getSrc(i->tex.target.getArgCount());
   // A LOD that is provably uniform across the quad needs no handling.
   if (lod->isUniform())
      return true;

   // Isolate the TEX in its own block and build a chain of blocks before
   // it, one per lane, each branching to the TEX block when the remaining
   // active lanes share that lane's LOD value.
   BasicBlock *currBB = i->bb;
   BasicBlock *texiBB = i->bb->splitBefore(i, false);
   BasicBlock *joinBB = i->bb->splitAfter(i);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   for (int l = 0; l <= 3; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      bld.setPosition(currBB, true);
      // Subtract lane l's LOD from each lane's own; CC_EQ means this
      // lane can execute the TEX together with lane l's group.
      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
      if (l <= 2) {
         // Non-matching lanes fall through into the next lane's check.
         BasicBlock *laneBB = new BasicBlock(func);
         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
         currBB = laneBB;
      }
   }
   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
   return true;
}
989 
bool
NV50LoweringPreSSA::handleTXD(TexInstruction *i)
{
   // Quadops applied per source lane l: column 0 mixes in dPdx across the
   // quad's x direction, column 1 mixes in dPdy across the y direction
   // (see the uses of qOps[l][0] / qOps[l][1] below).
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4];
   Value *crd[3];
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();

   handleTEX(i);
   i->op = OP_TEX; // no need to clone dPdx/dPdy later
   i->tex.derivAll = true;

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   // Force all lanes active so each cloned TEX computes its implicit
   // derivatives from the per-quad coordinates we construct here.
   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      Value *src[3], *val;
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // normalize cube coordinates if necessary
      if (i->tex.target.isCube()) {
         // Divide by the largest absolute component: scale all three
         // coordinates by 1/max(|x|, |y|, |z|).
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, src[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l; // only lane l keeps this clone's result
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   // Merge the per-lane results back into the original defs.
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}
1064 
1065 bool
handleTXLQ(TexInstruction * i)1066 NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
1067 {
1068    handleTEX(i);
1069    bld.setPosition(i, true);
1070 
1071    /* The returned values are not quite what we want:
1072     * (a) convert from s32 to f32
1073     * (b) multiply by 1/256
1074     */
1075    for (int def = 0; def < 2; ++def) {
1076       if (!i->defExists(def))
1077          continue;
1078       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
1079       bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
1080                 i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
1081    }
1082    return true;
1083 }
1084 
1085 bool
handleTXQ(TexInstruction * i)1086 NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
1087 {
1088    Value *ms, *ms_x, *ms_y;
1089    if (i->tex.query == TXQ_DIMS) {
1090       if (i->tex.target.isMS()) {
1091          bld.setPosition(i, true);
1092          loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
1093          int d = 0;
1094          if (i->tex.mask & 1) {
1095             bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_x);
1096             d++;
1097          }
1098          if (i->tex.mask & 2) {
1099             bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_y);
1100             d++;
1101          }
1102       }
1103       return true;
1104    }
1105    assert(i->tex.query == TXQ_TYPE);
1106    assert(i->tex.mask == 4);
1107 
1108    loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
1109    bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms);
1110    i->bb->remove(i);
1111 
1112    return true;
1113 }
1114 
bool
NV50LoweringPreSSA::handleSUQ(TexInstruction *suq)
{
   const int dim = suq->tex.target.getDim();
   const int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
   int mask = suq->tex.mask;
   int slot = suq->tex.r;
   int c, d;

   // Answer the size components from the surface info constbuf; d tracks
   // the next def to fill, mask is shifted so bit 0 is always component c.
   for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
      if (c >= arg || !(mask & 1))
         continue;

      int offset;

      if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
         // 1D arrays keep their layer count in the Z size slot.
         offset = NV50_SU_INFO_SIZE(2);
      } else {
         offset = NV50_SU_INFO_SIZE(c);
      }
      bld.mkMov(suq->getDef(d++), loadSuInfo(slot, offset));
      if (c == 2 && suq->tex.target.isCube())
         // Cube layers are stored as 6 faces each; report layers.
         bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
                   bld.loadImm(NULL, 6));
   }

   // Fourth component: the sample count, 1 << (ms_x + ms_y) for MS
   // targets and constant 1 otherwise.
   if (mask & 1) {
      if (suq->tex.target.isMS()) {
         Value *ms_x = loadSuInfo(slot, NV50_SU_INFO_MS(0));
         Value *ms_y = loadSuInfo(slot, NV50_SU_INFO_MS(1));
         Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
         bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
      } else {
         bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
      }
   }

   bld.remove(suq);
   return true;
}
1155 
1156 bool
handleBUFQ(Instruction * bufq)1157 NV50LoweringPreSSA::handleBUFQ(Instruction *bufq)
1158 {
1159    bufq->op = OP_MOV;
1160    bufq->setSrc(0, loadSuInfo(bufq->getSrc(0)->reg.fileIndex, NV50_SU_INFO_SIZE_X));
1161    bufq->setIndirect(0, 0, NULL);
1162    bufq->setIndirect(0, 1, NULL);
1163    return true;
1164 }
1165 
1166 bool
handleSET(Instruction * i)1167 NV50LoweringPreSSA::handleSET(Instruction *i)
1168 {
1169    if (i->dType == TYPE_F32) {
1170       bld.setPosition(i, true);
1171       i->dType = TYPE_U32;
1172       bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
1173       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
1174    }
1175    return true;
1176 }
1177 
bool
NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
{
   Value *src0 = bld.getSSA();
   Value *src1 = bld.getSSA();
   Value *pred = bld.getScratch(1, FILE_FLAGS);

   Value *v0 = i->getSrc(0);
   Value *v1 = i->getSrc(1);
   // XXX: these probably shouldn't be immediates in the first place ...
   if (v0->asImm())
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
   if (v1->asImm())
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);

   // After i: select between the two values with complementary predicated
   // movs, merged into the original def by OP_UNION.
   bld.setPosition(i, true);
   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);

   // Turn i itself into the flag-producing comparison: SET of the original
   // third source against 0, writing the flags the movs test.
   bld.setPosition(i, false);
   i->op = OP_SET;
   i->setFlagsDef(0, pred);
   i->dType = TYPE_U8;
   i->setSrc(0, i->getSrc(2));
   i->setSrc(2, NULL);
   i->setSrc(1, bld.loadImm(NULL, 0));

   return true;
}
1208 
1209 bool
handleSELP(Instruction * i)1210 NV50LoweringPreSSA::handleSELP(Instruction *i)
1211 {
1212    Value *src0 = bld.getSSA();
1213    Value *src1 = bld.getSSA();
1214 
1215    Value *v0 = i->getSrc(0);
1216    Value *v1 = i->getSrc(1);
1217    if (v0->asImm())
1218       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1219    if (v1->asImm())
1220       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1221 
1222    bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
1223    bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
1224    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1225    delete_Instruction(prog, i);
1226    return true;
1227 }
1228 
1229 bool
handleWRSV(Instruction * i)1230 NV50LoweringPreSSA::handleWRSV(Instruction *i)
1231 {
1232    Symbol *sym = i->getSrc(0)->asSym();
1233 
1234    // these are all shader outputs, $sreg are not writeable
1235    uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
1236    if (addr >= 0x400)
1237       return false;
1238    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
1239 
1240    bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
1241 
1242    bld.getBB()->remove(i);
1243    return true;
1244 }
1245 
1246 bool
handleCALL(Instruction * i)1247 NV50LoweringPreSSA::handleCALL(Instruction *i)
1248 {
1249    if (prog->getType() == Program::TYPE_COMPUTE) {
1250       // Add implicit "thread id" argument in $r0 to the function
1251       i->setSrc(i->srcCount(), tid);
1252    }
1253    return true;
1254 }
1255 
bool
NV50LoweringPreSSA::handlePRECONT(Instruction *i)
{
   // PRECONT has no nv50 equivalent; simply drop it.
   delete_Instruction(prog, i);
   return true;
}
1262 
bool
NV50LoweringPreSSA::handleCONT(Instruction *i)
{
   // Lower continue to a plain branch (its target is already set up).
   i->op = OP_BRA;
   return true;
}
1269 
bool
NV50LoweringPreSSA::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
   Value *def = i->getDef(0);
   SVSemantic sv = sym->reg.data.sv.sv;
   int idx = sym->reg.data.sv.index;

   // Addresses >= 0x400 denote $sreg values RDSV can read directly.
   if (addr >= 0x400) // mov $sreg
      return true;

   switch (sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
      break;
   case SV_FACE:
      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
      // For a float result, map the raw face bit to +/-1.0 via
      // or-with-1, integer negate and int-to-float conversion.
      if (i->dType == TYPE_F32) {
         bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
         bld.mkOp1(OP_NEG, TYPE_S32, def, def);
         bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
      }
      break;
   case SV_NCTAID:
   case SV_CTAID:
   case SV_NTID: {
      // These values are provided as 16-bit words in shared memory;
      // load and widen to 32 bit.
      Value *x = bld.getSSA(2);
      bld.mkOp1(OP_LOAD, TYPE_U16, x,
                bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
      bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
      break;
   }
   case SV_TID:
      // The combined thread id packs x in bits [15:0], y in [25:16] and
      // z in [31:26]; extract the requested component.
      if (idx == 0) {
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
      } else if (idx == 1) {
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
      } else if (idx == 2) {
         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
      } else {
         bld.mkMov(def, bld.mkImm(0));
      }
      break;
   case SV_COMBINED_TID:
      bld.mkMov(def, tid);
      break;
   case SV_SAMPLE_POS: {
      // Look the position up in the driver's sample info table, indexed
      // by the current sample index (8 bytes per entry, hence << 3).
      Value *off = new_LValue(func, FILE_ADDRESS);
      bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
      bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
      bld.mkLoad(TYPE_F32,
                 def,
                 bld.mkSymbol(
                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                       TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
                 off);
      break;
   }
   case SV_THREAD_KILL:
      // Not actually supported. But it's implementation-dependent, so we can
      // always just say it's not a helper.
      bld.mkMov(def, bld.loadImm(NULL, 0));
      break;
   default:
      // Everything else is a regular shader input fetch.
      bld.mkFetch(i->getDef(0), i->dType,
                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
      break;
   }
   bld.getBB()->remove(i);
   return true;
}
1344 
1345 bool
handleDIV(Instruction * i)1346 NV50LoweringPreSSA::handleDIV(Instruction *i)
1347 {
1348    if (!isFloatType(i->dType))
1349       return true;
1350    bld.setPosition(i, false);
1351    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
1352    i->op = OP_MUL;
1353    i->setSrc(1, rcp->getDef(0));
1354    return true;
1355 }
1356 
1357 bool
handleSQRT(Instruction * i)1358 NV50LoweringPreSSA::handleSQRT(Instruction *i)
1359 {
1360    bld.setPosition(i, true);
1361    i->op = OP_RSQ;
1362    bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
1363 
1364    return true;
1365 }
1366 
1367 bool
handlePOW(Instruction * i)1368 NV50LoweringPreSSA::handlePOW(Instruction *i)
1369 {
1370    LValue *val = bld.getScratch();
1371 
1372    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
1373    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
1374    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
1375 
1376    i->op = OP_EX2;
1377    i->setSrc(0, val);
1378    i->setSrc(1, NULL);
1379 
1380    return true;
1381 }
1382 
1383 bool
handleEXPORT(Instruction * i)1384 NV50LoweringPreSSA::handleEXPORT(Instruction *i)
1385 {
1386    if (prog->getType() == Program::TYPE_FRAGMENT) {
1387       if (i->getIndirect(0, 0)) {
1388          // TODO: redirect to l[] here, load to GPRs at exit
1389          return false;
1390       } else {
1391          int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
1392 
1393          i->op = OP_MOV;
1394          i->subOp = NV50_IR_SUBOP_MOV_FINAL;
1395          i->src(0).set(i->src(1));
1396          i->setSrc(1, NULL);
1397          i->setDef(0, new_LValue(func, FILE_GPR));
1398          i->getDef(0)->reg.data.id = id;
1399 
1400          prog->maxGPR = MAX2(prog->maxGPR, id * 2);
1401       }
1402    }
1403    return true;
1404 }
1405 
1406 // Handle indirect addressing in geometry shaders:
1407 //
1408 // ld $r0 a[$a1][$a2+k] ->
1409 // ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
1410 //
bool
NV50LoweringPreSSA::handleLOAD(Instruction *i)
{
   ValueRef src = i->src(0);
   Symbol *sym = i->getSrc(0)->asSym();

   // Compute-shader memory accesses take the common ld/st lowering path.
   if (prog->getType() == Program::TYPE_COMPUTE) {
      if (sym->inFile(FILE_MEMORY_SHARED) ||
          sym->inFile(FILE_MEMORY_BUFFER) ||
          sym->inFile(FILE_MEMORY_GLOBAL)) {
         return handleLDST(i);
      }
   }

   // A second indirect source means a[$a1][$a2+k] addressing (geometry
   // shader vertex indexing, see the comment above this function); fold
   // everything into the first indirect slot.
   if (src.isIndirect(1)) {
      assert(prog->getType() == Program::TYPE_GEOMETRY);
      Value *addr = i->getIndirect(0, 1);

      if (src.isIndirect(0)) {
         // base address is in an address register, so move to a GPR
         Value *base = bld.getScratch();
         bld.mkMov(base, addr);

         Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
         Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
         // attrib index in bytes (x4)
         Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                                    i->getIndirect(0, 0), bld.mkImm(2));

         // Calculate final address: addr = base + attr*vstride; use 16-bit
         // multiplication since 32-bit would be lowered to multiple
         // instructions, and we only need the low 16 bits of the result
         Value *a[2], *b[2];
         bld.mkSplit(a, 2, attrib);
         bld.mkSplit(b, 2, vstride);
         Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
                                 base);

         // move address from GPR into an address register
         addr = bld.getSSA(2, FILE_ADDRESS);
         bld.mkMov(addr, sum);
      }

      i->setIndirect(0, 1, NULL);
      i->setIndirect(0, 0, addr);
   }

   return true;
}
1459 
// Lower a shared-memory atomic into a lock/compute/store-unlock retry loop:
//
//   tryLock:       locked load of the current value
//   setAndUnlock:  (if lock taken) compute the new value, store + unlock
//   failLock:      retry tryLock until the lock was acquired, then join
//
// Pre-nva0 chips have no load-locked/store-unlocked, so the "lock" is
// faked by forcing the flags to an always-taken state.
bool
NV50LoweringPreSSA::handleSharedATOM(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);
   BasicBlock *setAndUnlockBB = new BasicBlock(func);
   BasicBlock *failLockBB = new BasicBlock(func);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);

   bld.setPosition(tryLockBB, true);

   // Load the old value; on >= nva0 this also tries to take the lock and
   // reports success in the flags.
   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   Value *locked = bld.getSSA(1, FILE_FLAGS);
   if (prog->getTarget()->getChipset() >= 0xa0) {
      ld->setFlagsDef(1, locked);
      ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
   } else {
      // No hardware lock: force flag state so the CC_LT branch below is
      // always taken.
      bld.mkMov(locked, bld.loadImm(NULL, 2))
         ->flagsDef = 0;
   }

   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_LT, locked);
   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);

   // splitAfter left an edge from tryLockBB to joinBB; the atom itself
   // is replaced by the blocks built here.
   tryLockBB->cfg.detach(&joinBB->cfg);
   bld.remove(atom);

   bld.setPosition(setAndUnlockBB, true);
   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      // Compare-and-swap: select the new value if old == compare value,
      // else store the old value back unchanged.
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_FLAGS),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));

      Instruction *selp =
         bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), atom->getSrc(2),
                   ld->getDef(0), set->getDef(0));
      stVal = selp->getDef(0);

      // SELP itself must be lowered for nv50.
      handleSELP(selp);
   } else {
      // Arithmetic/logical atomics: apply the op to old value and operand.
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return false;
      }

      Instruction *i =
         bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(1));

      stVal = i->getDef(0);
   }

   // Store the result back; on >= nva0 this also releases the lock.
   Instruction *store = bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
               atom->getIndirect(0, 0), stVal);
   if (prog->getTarget()->getChipset() >= 0xa0) {
      store->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
   }

   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);

   // Loop until the lock is acquired.
   bld.setPosition(failLockBB, true);
   bld.mkFlow(OP_BRA, tryLockBB, CC_GEU, locked);
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);

   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;

   return true;
}
1571 
bool
NV50LoweringPreSSA::handleLDST(Instruction *i)
{
   ValueRef src = i->src(0);
   Symbol *sym = i->getSrc(0)->asSym();

   // Only compute shaders need this lowering.
   if (prog->getType() != Program::TYPE_COMPUTE) {
      return true;
   }

   // Buffers just map directly to the different global memory spaces
   if (sym->inFile(FILE_MEMORY_BUFFER)) {
      sym->reg.file = FILE_MEMORY_GLOBAL;
   }

   if (sym->inFile(FILE_MEMORY_SHARED)) {

      // Shared-memory indirection goes through an address register.
      if (src.isIndirect(0)) {
         Value *addr = i->getIndirect(0, 0);

         if (!addr->inFile(FILE_ADDRESS)) {
            // Move address from GPR into an address register
            Value *new_addr = bld.getSSA(2, FILE_ADDRESS);
            bld.mkMov(new_addr, addr);

            i->setIndirect(0, 0, new_addr);
         }
      }

      // Shared-memory atomics get the lock/retry-loop lowering.
      if (i->op == OP_ATOM)
         handleSharedATOM(i);
   } else if (sym->inFile(FILE_MEMORY_GLOBAL)) {
      // All global access must be indirect. There are no instruction forms
      // with direct access.
      Value *addr = i->getIndirect(0, 0);

      // Fold the symbol's constant offset into the indirect address.
      Value *offset = bld.loadImm(bld.getSSA(), sym->reg.data.offset);
      Value *sum;
      if (addr != NULL)
         sum = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), addr,
                          offset);
      else
         sum = offset;

      i->setIndirect(0, 0, sum);
      sym->reg.data.offset = 0;
   }

   return true;
}
1622 
bool
NV50LoweringPreSSA::handleMEMBAR(Instruction *i)
{
   // For global memory, apparently doing a bunch of reads at different
   // addresses forces things to get sufficiently flushed.
   if (i->subOp & NV50_IR_SUBOP_MEMBAR_GL) {
      uint8_t b = prog->driver->io.auxCBSlot;
      // Base address of the driver's membar scratch area, offset per warp
      // (physid & 0x1f, scaled by 4) so warps don't hit the same line.
      Value *base =
         bld.mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32,
                                            prog->driver->io.membarOffset), NULL);
      Value *physid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), bld.mkSysVal(SV_PHYSID, 0));
      Value *off = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                              bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(),
                                         physid, bld.loadImm(NULL, 0x1f)),
                              bld.loadImm(NULL, 2));
      base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, off);
      Symbol *gmemMembar = bld.mkSymbol(FILE_MEMORY_GLOBAL, prog->driver->io.gmemMembar, TYPE_U32, 0);
      // Eight loads, 0x100 bytes apart; marked fixed so they are not
      // eliminated as dead (their results are unused).
      for (int i = 0; i < 8; i++) {
         if (i != 0) {
            base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, bld.loadImm(NULL, 0x100));
         }
         bld.mkLoad(TYPE_U32, bld.getSSA(), gmemMembar, base)
            ->fixed = 1;
      }
   }

   // Both global and shared memory barriers also need a regular control bar
   // TODO: double-check this is the case
   i->op = OP_BAR;
   i->subOp = NV50_IR_SUBOP_BAR_SYNC;
   i->setSrc(0, bld.mkImm(0u));
   i->setSrc(1, bld.mkImm(0u));

   return true;
}
1658 
1659 // The type that bests represents how each component can be stored when packed.
1660 static DataType
getPackedType(const TexInstruction::ImgFormatDesc * t,int c)1661 getPackedType(const TexInstruction::ImgFormatDesc *t, int c)
1662 {
1663    switch (t->type) {
1664    case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
1665    case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
1666    case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
1667    case UINT:
1668       return (t->bits[c] == 8 ? TYPE_U8 :
1669               (t->bits[c] <= 16 ? TYPE_U16 : TYPE_U32));
1670    case SINT:
1671       return (t->bits[c] == 8 ? TYPE_S8 :
1672               (t->bits[c] <= 16 ? TYPE_S16 : TYPE_S32));
1673    }
1674    return TYPE_NONE;
1675 }
1676 
1677 // The type that the rest of the shader expects to process this image type in.
1678 static DataType
getShaderType(const ImgType type)1679 getShaderType(const ImgType type) {
1680    switch (type) {
1681    case FLOAT:
1682    case UNORM:
1683    case SNORM:
1684       return TYPE_F32;
1685    case UINT:
1686       return TYPE_U32;
1687    case SINT:
1688       return TYPE_S32;
1689    default:
1690       assert(!"Impossible type");
1691       return TYPE_NONE;
1692    }
1693 }
1694 
1695 // Reads the raw coordinates out of the input instruction, and returns a
1696 // single-value coordinate which is what the hardware expects to receive in a
1697 // ld/st op.
1698 Value *
processSurfaceCoords(TexInstruction * su)1699 NV50LoweringPreSSA::processSurfaceCoords(TexInstruction *su)
1700 {
1701    const int slot = su->tex.r;
1702    const int dim = su->tex.target.getDim();
1703    const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
1704 
1705    const TexInstruction::ImgFormatDesc *format = su->tex.format;
1706    const uint16_t bytes = (format->bits[0] + format->bits[1] +
1707                            format->bits[2] + format->bits[3]) / 8;
1708    uint16_t shift = ffs(bytes) - 1;
1709 
1710    // Buffer sizes don't necessarily fit in 16-bit values
1711    if (su->tex.target == TEX_TARGET_BUFFER) {
1712       return bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1713                         su->getSrc(0), bld.loadImm(NULL, (uint32_t)shift));
1714    }
1715 
1716    // For buffers, we just need the byte offset. And for 2d buffers we want
1717    // the x coordinate in bytes as well.
1718    Value *coords[3] = {};
1719    for (int i = 0; i < arg; i++) {
1720       Value *src[2];
1721       bld.mkSplit(src, 2, su->getSrc(i));
1722       coords[i] = src[0];
1723       // For 1d-images, we want the y coord to be 0, which it will be here.
1724       if (i == 0)
1725          coords[1] = src[1];
1726    }
1727 
1728    coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1729                           coords[0], bld.loadImm(NULL, shift));
1730 
1731    if (su->tex.target.isMS()) {
1732       Value *ms_x = loadSuInfo16(slot, NV50_SU_INFO_MS(0));
1733       Value *ms_y = loadSuInfo16(slot, NV50_SU_INFO_MS(1));
1734       coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[0], ms_x);
1735       coords[1] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[1], ms_y);
1736    }
1737 
1738    // If there are more dimensions, we just want the y-offset. But that needs
1739    // to be adjusted up by the y-stride for array images.
1740    if (su->tex.target.isArray() || su->tex.target.isCube()) {
1741       Value *index = coords[dim];
1742       Value *height = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
1743       Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4), index, height);
1744       mul->sType = TYPE_U16;
1745       Value *muls[2];
1746       bld.mkSplit(muls, 2, mul->getDef(0));
1747       if (dim > 1)
1748          coords[1] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), coords[1], muls[0]);
1749       else
1750          coords[1] = muls[0];
1751    }
1752 
1753    // 3d is special-cased. Note that a single "slice" of a 3d image may
1754    // also be attached as 2d, so we have to do the same 3d processing for
1755    // 2d as well, just in case. In order to remap a 3d image onto a 2d
1756    // image, we have to retile it "by hand".
1757    if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) {
1758       Value *z = loadSuInfo16(slot, NV50_SU_INFO_OFFSET_Z);
1759       Value *y_size_aligned = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
1760       // Add the z coordinate for actual 3d-images
1761       if (dim > 2)
1762          coords[2] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), z, coords[2]);
1763       else
1764          coords[2] = z;
1765 
1766       // Compute the surface parameters from tile shifts
1767       Value *tile_shift[3];
1768       Value *tile_size[3];
1769       Value *tile_mask[3];
1770       // We only ever use one kind of X-tiling.
1771       tile_shift[0] = bld.loadImm(NULL, (uint16_t)6);
1772       tile_size[0] = bld.loadImm(NULL, (uint16_t)64);
1773       tile_mask[0] = bld.loadImm(NULL, (uint16_t)63);
1774       // Fetch the "real" tiling parameters of the underlying surface
1775       for (int i = 1; i < 3; i++) {
1776          tile_shift[i] = loadSuInfo16(slot, NV50_SU_INFO_TILE_SHIFT(i));
1777          tile_size[i] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), bld.loadImm(NULL, (uint16_t)1), tile_shift[i]);
1778          tile_mask[i] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), tile_size[i], bld.loadImm(NULL, (uint16_t)-1));
1779       }
1780 
1781       // Compute the location of given coordinate, both inside the tile as
1782       // well as which (linearly-laid out) tile it's in.
1783       Value *coord_in_tile[3];
1784       Value *tile[3];
1785       for (int i = 0; i < 3; i++) {
1786          coord_in_tile[i] = bld.mkOp2v(OP_AND, TYPE_U16, bld.getSSA(2), coords[i], tile_mask[i]);
1787          tile[i] = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), coords[i], tile_shift[i]);
1788       }
1789 
1790       // Based on the "real" tiling parameters, compute x/y coordinates in the
1791       // larger surface with 2d tiling that was supplied to the hardware. This
1792       // was determined and verified with the help of the tiling pseudocode in
1793       // the envytools docs.
1794       //
1795       // adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size +
1796       //         z_coord_in_tile * x_tile_size
1797       // adj_y = y_coord_in_tile + y_tile * y_tile_size +
1798       //         z_tile * y_tile_size * y_tiles
1799       //
1800       // Note: STRIDE_Y = y_tile_size * y_tiles
1801 
1802       coords[0] = bld.mkOp2v(
1803             OP_ADD, TYPE_U16, bld.getSSA(2),
1804             bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
1805                        coord_in_tile[0],
1806                        bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1807                                   tile[0],
1808                                   bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
1809                                              tile_shift[2], tile_shift[0]))),
1810             bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1811                        coord_in_tile[2], tile_shift[0]));
1812 
1813       Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4),
1814                                    tile[2], y_size_aligned);
1815       mul->sType = TYPE_U16;
1816       Value *muls[2];
1817       bld.mkSplit(muls, 2, mul->getDef(0));
1818 
1819       coords[1] = bld.mkOp2v(
1820             OP_ADD, TYPE_U16, bld.getSSA(2),
1821             muls[0],
1822             bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
1823                        coord_in_tile[1],
1824                        bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1825                                   tile[1], tile_shift[1])));
1826    }
1827 
1828    return bld.mkOp2v(OP_MERGE, TYPE_U32, bld.getSSA(), coords[0], coords[1]);
1829 }
1830 
1831 // This is largely a copy of NVC0LoweringPass::convertSurfaceFormat, but
1832 // adjusted to make use of 16-bit math where possible.
1833 bool
handleSULDP(TexInstruction * su)1834 NV50LoweringPreSSA::handleSULDP(TexInstruction *su)
1835 {
1836    const int slot = su->tex.r;
1837    assert(!su->getIndirectR());
1838 
1839    bld.setPosition(su, false);
1840 
1841    const TexInstruction::ImgFormatDesc *format = su->tex.format;
1842    const int bytes = (su->tex.format->bits[0] +
1843                       su->tex.format->bits[1] +
1844                       su->tex.format->bits[2] +
1845                       su->tex.format->bits[3]) / 8;
1846    DataType ty = typeOfSize(bytes);
1847 
1848    Value *coord = processSurfaceCoords(su);
1849 
1850    Value *untypedDst[4] = {};
1851    Value *typedDst[4] = {};
1852    int i;
1853    for (i = 0; i < bytes / 4; i++)
1854       untypedDst[i] = bld.getSSA();
1855    if (bytes < 4)
1856       untypedDst[0] = bld.getSSA();
1857 
1858    for (i = 0; i < 4; i++)
1859       typedDst[i] = su->getDef(i);
1860 
1861    Instruction *load = bld.mkLoad(ty, NULL, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, ty, 0), coord);
1862    for (i = 0; i < 4 && untypedDst[i]; i++)
1863       load->setDef(i, untypedDst[i]);
1864 
1865    // Unpack each component into the typed dsts
1866    int bits = 0;
1867    for (int i = 0; i < 4; bits += format->bits[i], i++) {
1868       if (!typedDst[i])
1869          continue;
1870 
1871       if (i >= format->components) {
1872          if (format->type == FLOAT ||
1873              format->type == UNORM ||
1874              format->type == SNORM)
1875             bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
1876          else
1877             bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
1878          continue;
1879       }
1880 
1881       // Get just that component's data into the relevant place
1882       if (format->bits[i] == 32)
1883          bld.mkMov(typedDst[i], untypedDst[i]);
1884       else if (format->bits[i] == 16) {
1885          // We can always convert directly from the appropriate half of the
1886          // loaded value into the typed result.
1887          Value *src[2];
1888          bld.mkSplit(src, 2, untypedDst[i / 2]);
1889          bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
1890                    getPackedType(format, i), src[i & 1]);
1891       }
1892       else if (format->bits[i] == 8) {
1893          // Same approach as for 16 bits, but we have to massage the value a
1894          // bit more, since we have to get the appropriate 8 bits from the
1895          // half-register. In all cases, we can CVT from a 8-bit source, so we
1896          // only have to shift when we want the upper 8 bits.
1897          Value *src[2], *shifted;
1898          bld.mkSplit(src, 2, untypedDst[0]);
1899          DataType packedType = getPackedType(format, i);
1900          if (i & 1)
1901             shifted = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), src[!!(i & 2)], bld.loadImm(NULL, (uint16_t)8));
1902          else
1903             shifted = src[!!(i & 2)];
1904 
1905          bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
1906                    packedType, shifted);
1907       }
1908       else {
1909          // The options are 10, 11, and 2. Get it into a 32-bit reg, then
1910          // shift/mask. That's where it'll have to end up anyways. For signed,
1911          // we have to make sure to get sign-extension, so we actually have to
1912          // shift *up* first, and then shift down. There's no advantage to
1913          // AND'ing, so we don't.
1914          DataType ty = TYPE_U32;
1915          if (format->type == SNORM || format->type == SINT) {
1916             ty = TYPE_S32;
1917          }
1918 
1919          // Poor man's EXTBF
1920          bld.mkOp2(
1921                OP_SHR, ty, typedDst[i],
1922                bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), untypedDst[0], bld.loadImm(NULL, 32 - bits - format->bits[i])),
1923                bld.loadImm(NULL, 32 - format->bits[i]));
1924 
1925          // If the stored data is already in the appropriate type, we don't
1926          // have to do anything. Convert to float for the *NORM formats.
1927          if (format->type == UNORM || format->type == SNORM)
1928             bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_U32, typedDst[i]);
1929       }
1930 
1931       // Normalize / convert as necessary
1932       if (format->type == UNORM)
1933          bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
1934       else if (format->type == SNORM)
1935          bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
1936       else if (format->type == FLOAT && format->bits[i] < 16) {
1937          // We expect the value to be in the low bits of the register, so we
1938          // have to shift back up.
1939          bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
1940          Value *src[2];
1941          bld.mkSplit(src, 2, typedDst[i]);
1942          bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, src[0]);
1943       }
1944    }
1945 
1946    if (format->bgra) {
1947       std::swap(typedDst[0], typedDst[2]);
1948    }
1949 
1950    bld.getBB()->remove(su);
1951    return true;
1952 }
1953 
1954 bool
handleSUREDP(TexInstruction * su)1955 NV50LoweringPreSSA::handleSUREDP(TexInstruction *su)
1956 {
1957    const int slot = su->tex.r;
1958    const int dim = su->tex.target.getDim();
1959    const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
1960    assert(!su->getIndirectR());
1961 
1962    bld.setPosition(su, false);
1963 
1964    Value *coord = processSurfaceCoords(su);
1965 
1966    // This is guaranteed to be a 32-bit format. So there's nothing to
1967    // pack/unpack.
1968    Instruction *atom = bld.mkOp2(
1969          OP_ATOM, su->dType, su->getDef(0),
1970          bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), su->getSrc(arg));
1971    if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
1972       atom->setSrc(2, su->getSrc(arg + 1));
1973    atom->setIndirect(0, 0, coord);
1974    atom->subOp = su->subOp;
1975 
1976    bld.getBB()->remove(su);
1977    return true;
1978 }
1979 
// Lower a typed surface store (SUSTP) into per-component packing followed
// by an untyped global store. This is the store-side counterpart of
// handleSULDP: shader-typed source values are converted into the surface
// format's packed representation (using 16-bit math where possible) and
// merged into a register of at least 32 bits before being stored.
bool
NV50LoweringPreSSA::handleSUSTP(TexInstruction *su)
{
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   // The data operands start right after the coordinate sources.
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   assert(!su->getIndirectR());

   bld.setPosition(su, false);

   // Total texel size in bytes; determines the width of the final store.
   const TexInstruction::ImgFormatDesc *format = su->tex.format;
   const int bytes = (su->tex.format->bits[0] +
                      su->tex.format->bits[1] +
                      su->tex.format->bits[2] +
                      su->tex.format->bits[3]) / 8;
   DataType ty = typeOfSize(bytes);

   Value *coord = processSurfaceCoords(su);

   // The packed values we will eventually store into memory
   Value *untypedDst[4] = {};
   // Each component's packed representation, in 16-bit registers (only used
   // where appropriate)
   Value *untypedDst16[4] = {};
   // The original values that are being packed
   Value *typedDst[4] = {};
   int i;

   for (i = 0; i < bytes / 4; i++)
      untypedDst[i] = bld.getSSA();
   for (i = 0; i < format->components; i++)
      untypedDst16[i] = bld.getSSA(2);
   // Make sure we get at least one of each value allocated for the
   // super-narrow formats.
   if (bytes < 4)
      untypedDst[0] = bld.getSSA();
   if (bytes < 2)
      untypedDst16[0] = bld.getSSA(2);

   // Copy the sources so they can be mutated in place during conversion.
   for (i = 0; i < 4; i++) {
      typedDst[i] = bld.getSSA();
      bld.mkMov(typedDst[i], su->getSrc(arg + i));
   }

   // The shader provides rgba; swap r/b so they land in memory order.
   if (format->bgra) {
      std::swap(typedDst[0], typedDst[2]);
   }

   // Pack each component into the untyped dsts. "bits" tracks the bit
   // offset of component i within the packed texel.
   int bits = 0;
   for (int i = 0; i < format->components; bits += format->bits[i], i++) {
      // Un-normalize / convert as necessary
      if (format->type == UNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << format->bits[i]) - 1)));
      else if (format->type == SNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << (format->bits[i] - 1)) - 1)));

      // There is nothing to convert/pack for 32-bit values
      if (format->bits[i] == 32) {
         bld.mkMov(untypedDst[i], typedDst[i]);
         continue;
      }

      // The remainder of the cases will naturally want to deal in 16-bit
      // registers. We will put these into untypedDst16 and then merge them
      // together later.
      if (format->type == FLOAT && format->bits[i] < 16) {
         // Small float (10/11-bit): convert to f16 and drop the excess
         // low mantissa bits by shifting right.
         bld.mkCvt(OP_CVT, TYPE_F16, untypedDst16[i], TYPE_F32, typedDst[i]);
         bld.mkOp2(OP_SHR, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(15 - format->bits[i])));

         // For odd bit sizes, it's easier to pack it into the final
         // destination directly.
         Value *tmp = bld.getSSA();
         bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
         if (i == 0) {
            untypedDst[0] = tmp;
         } else {
            // Shift into position by the accumulated bit offset and OR in.
            bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
            bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
         }
      } else if (format->bits[i] == 16) {
         // We can always convert the shader value into the packed value
         // directly here
         bld.mkCvt(OP_CVT, getPackedType(format, i), untypedDst16[i],
                   getShaderType(format->type), typedDst[i]);
      } else if (format->bits[i] < 16) {
         DataType packedType = getPackedType(format, i);
         DataType shaderType = getShaderType(format->type);
         // We can't convert F32 to U8/S8 directly, so go to U16/S16 first.
         if (shaderType == TYPE_F32 && typeSizeof(packedType) == 1) {
            packedType = format->type == SNORM ? TYPE_S16 : TYPE_U16;
         }
         bld.mkCvt(OP_CVT, packedType, untypedDst16[i], shaderType, typedDst[i]);
         // TODO: clamp for 10- and 2-bit sizes. Also, due to the oddness of
         // the size, it's easier to dump them into a 32-bit value and OR
         // everything later.
         if (format->bits[i] != 8) {
            // Restrict value to the appropriate bits (although maybe supposed
            // to clamp instead?)
            bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)((1 << format->bits[i]) - 1)));
            // And merge into final packed value
            Value *tmp = bld.getSSA();
            bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
            if (i == 0) {
               untypedDst[0] = tmp;
            } else {
               bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
               bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
            }
         } else if (i & 1) {
            // Shift the 8-bit value up (so that it can be OR'd later)
            bld.mkOp2(OP_SHL, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(bits % 16)));
         } else if (packedType != TYPE_U8) {
            // S8 (or the *16 if converted from float) will all have high bits
            // set, so AND them out.
            bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)0xff));
         }
      }
   }

   // OR pairs of 8-bit values together (into the even value)
   if (format->bits[0] == 8) {
      for (i = 0; i < 2 && untypedDst16[2 * i] && untypedDst16[2 * i + 1]; i++)
         bld.mkOp2(OP_OR, TYPE_U16, untypedDst16[2 * i], untypedDst16[2 * i], untypedDst16[2 * i + 1]);
   }

   // We'll always want to have at least a 32-bit source register for the store
   Instruction *merge = bld.mkOp(OP_MERGE, bytes < 4 ? TYPE_U32 : ty, bld.getSSA(bytes < 4 ? 4 : bytes));
   if (format->bits[0] == 32) {
      // 32-bit components were packed straight into untypedDst.
      for (i = 0; i < 4 && untypedDst[i]; i++)
         merge->setSrc(i, untypedDst[i]);
   } else if (format->bits[0] == 16) {
      // Merge the 16-bit halves; pad with a dummy half if there's only one.
      for (i = 0; i < 4 && untypedDst16[i]; i++)
         merge->setSrc(i, untypedDst16[i]);
      if (i == 1)
         merge->setSrc(i, bld.getSSA(2));
   } else if (format->bits[0] == 8) {
      // The 8-bit pairs were OR'd into the even 16-bit registers above;
      // merge those, padding with a dummy half if necessary.
      for (i = 0; i < 2 && untypedDst16[2 * i]; i++)
         merge->setSrc(i, untypedDst16[2 * i]);
      if (i == 1)
         merge->setSrc(i, bld.getSSA(2));
   } else {
      // Odd-sized (10/11/2-bit) components were packed into untypedDst[0].
      merge->setSrc(0, untypedDst[0]);
   }

   bld.mkStore(OP_STORE, ty, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), coord, merge->getDef(0));

   bld.getBB()->remove(su);
   return true;
}
2130 
2131 bool
handlePFETCH(Instruction * i)2132 NV50LoweringPreSSA::handlePFETCH(Instruction *i)
2133 {
2134    assert(prog->getType() == Program::TYPE_GEOMETRY);
2135 
2136    // NOTE: cannot use getImmediate here, not in SSA form yet, move to
2137    // later phase if that assertion ever triggers:
2138 
2139    ImmediateValue *imm = i->getSrc(0)->asImm();
2140    assert(imm);
2141 
2142    assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens
2143 
2144    if (i->srcExists(1)) {
2145       // indirect addressing of vertex in primitive space
2146 
2147       LValue *val = bld.getScratch();
2148       Value *ptr = bld.getSSA(2, FILE_ADDRESS);
2149       bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
2150       bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);
2151 
2152       // NOTE: PFETCH directly to an $aX only works with direct addressing
2153       i->op = OP_SHL;
2154       i->setSrc(0, val);
2155       i->setSrc(1, bld.mkImm(0));
2156    }
2157 
2158    return true;
2159 }
2160 
2161 // Set flags according to predicate and make the instruction read $cX.
2162 void
checkPredicate(Instruction * insn)2163 NV50LoweringPreSSA::checkPredicate(Instruction *insn)
2164 {
2165    Value *pred = insn->getPredicate();
2166    Value *cdst;
2167 
2168    // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
2169    if (!pred ||
2170        pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
2171       return;
2172 
2173    cdst = bld.getSSA(1, FILE_FLAGS);
2174 
2175    bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
2176 
2177    insn->setPredicate(insn->cc, cdst);
2178 }
2179 
2180 //
2181 // - add quadop dance for texturing
2182 // - put FP outputs in GPRs
2183 // - convert instruction sequences
2184 //
2185 bool
visit(Instruction * i)2186 NV50LoweringPreSSA::visit(Instruction *i)
2187 {
2188    bld.setPosition(i, false);
2189 
2190    if (i->cc != CC_ALWAYS)
2191       checkPredicate(i);
2192 
2193    switch (i->op) {
2194    case OP_TEX:
2195    case OP_TXF:
2196    case OP_TXG:
2197       return handleTEX(i->asTex());
2198    case OP_TXB:
2199       return handleTXB(i->asTex());
2200    case OP_TXL:
2201       return handleTXL(i->asTex());
2202    case OP_TXD:
2203       return handleTXD(i->asTex());
2204    case OP_TXLQ:
2205       return handleTXLQ(i->asTex());
2206    case OP_TXQ:
2207       return handleTXQ(i->asTex());
2208    case OP_EX2:
2209       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
2210       i->setSrc(0, i->getDef(0));
2211       break;
2212    case OP_SET:
2213       return handleSET(i);
2214    case OP_SLCT:
2215       return handleSLCT(i->asCmp());
2216    case OP_SELP:
2217       return handleSELP(i);
2218    case OP_POW:
2219       return handlePOW(i);
2220    case OP_DIV:
2221       return handleDIV(i);
2222    case OP_SQRT:
2223       return handleSQRT(i);
2224    case OP_EXPORT:
2225       return handleEXPORT(i);
2226    case OP_LOAD:
2227       return handleLOAD(i);
2228    case OP_MEMBAR:
2229       return handleMEMBAR(i);
2230    case OP_ATOM:
2231    case OP_STORE:
2232       return handleLDST(i);
2233    case OP_SULDP:
2234       return handleSULDP(i->asTex());
2235    case OP_SUSTP:
2236       return handleSUSTP(i->asTex());
2237    case OP_SUREDP:
2238       return handleSUREDP(i->asTex());
2239    case OP_SUQ:
2240       return handleSUQ(i->asTex());
2241    case OP_BUFQ:
2242       return handleBUFQ(i);
2243    case OP_RDSV:
2244       return handleRDSV(i);
2245    case OP_WRSV:
2246       return handleWRSV(i);
2247    case OP_CALL:
2248       return handleCALL(i);
2249    case OP_PRECONT:
2250       return handlePRECONT(i);
2251    case OP_CONT:
2252       return handleCONT(i);
2253    case OP_PFETCH:
2254       return handlePFETCH(i);
2255    default:
2256       break;
2257    }
2258    return true;
2259 }
2260 
2261 bool
runLegalizePass(Program * prog,CGStage stage) const2262 TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
2263 {
2264    bool ret = false;
2265 
2266    if (stage == CG_STAGE_PRE_SSA) {
2267       NV50LoweringPreSSA pass(prog);
2268       ret = pass.run(prog, false, true);
2269    } else
2270    if (stage == CG_STAGE_SSA) {
2271       if (!prog->targetPriv)
2272          prog->targetPriv = new std::list<Instruction *>();
2273       NV50LegalizeSSA pass(prog);
2274       ret = pass.run(prog, false, true);
2275    } else
2276    if (stage == CG_STAGE_POST_RA) {
2277       NV50LegalizePostRA pass;
2278       ret = pass.run(prog, false, true);
2279       if (prog->targetPriv)
2280          delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
2281    }
2282    return ret;
2283 }
2284 
2285 } // namespace nv50_ir
2286