/* libs/pixelflinger/codeflinger/texturing.cpp
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/

#define LOG_TAG "pixelflinger-code"

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

#include <log/log.h>

#include "GGLAssembler.h"

namespace android {

// ---------------------------------------------------------------------------

// iterators are initialized like this:
// (intToFixedCenter(x) * dx)>>16 + x0
// ((x<<16 + 0x8000) * dx)>>16 + x0
// ((x<<16)*dx + (0x8000*dx))>>16 + x0
// ( (x*dx) + dx>>1 ) + x0
// (x*dx) + (dx>>1 + x0)
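//
// e.g. with x = 2, dx = 0x10000 (1.0 in 16.16) and x0 = 0, all of the
// above evaluate to 2.5 in 16.16: the iterator starts at the pixel center.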

void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
{
    context_t const* c = mBuilderContext.c;

    if (mSmooth) {
        // NOTE: we could take this path in the mDithering + !mSmooth case
        // as well, but it would use up to 4 more registers for the color
        // components for only a little added quality.
        // Currently, this causes the system to run out of registers in
        // some cases (see issue #719496)

        comment("compute initial iterated color (smooth and/or dither case)");

        parts.iterated_packed = 0;
        parts.packed = 0;

        // 0x1: color component
        // 0x2: iterators
        const int optReload = mOptLevel >> 1;
        if (optReload >= 3)         parts.reload = 0; // reload nothing
        else if (optReload == 2)    parts.reload = 2; // reload iterators
        else if (optReload == 1)    parts.reload = 1; // reload colors
        else if (optReload <= 0)    parts.reload = 3; // reload both

        if (!mSmooth) {
            // we're not smoothing (just dithering), so we never have to
            // reload the iterators
            parts.reload &= ~2;
        }

        Scratch scratches(registerFile());
        const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
        const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
        for (int i=0 ; i<4 ; i++) {
            if (!mInfo[i].iterated)
                continue;

            // this component exists in the destination and is not replaced
            // by a texture unit.
            const int c = (parts.reload & 1) ? t0 : obtainReg();
            if (i==0) CONTEXT_LOAD(c, iterators.ydady);
            if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
            if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
            if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
            parts.argb[i].reg = c;

            if (mInfo[i].smooth) {
                parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
                const int dvdx = parts.argb_dx[i].reg;
                CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
                MLA(AL, 0, c, x.reg, dvdx, c);

                // adjust the color iterator to make sure it won't overflow
                if (!mAA) {
                    // this is not needed when we're using anti-aliasing
                    // because we will (have to) clamp the components
                    // anyway.
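                    // The code below computes the iterator's value at the
                    // span's last pixel (end = c + dvdx*(count>>16), with
                    // the flags set); if that value went negative, the
                    // start is raised by |end| so the last pixel lands
                    // exactly on 0, and the final BIC clamps the start
                    // itself to >= 0.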
                    int end = scratches.obtain();
                    MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
                    MLA(AL, 1, end, dvdx, end, c);
                    SUB(MI, 0, c, c, end);
                    BIC(AL, 0, c, c, reg_imm(c, ASR, 31));
                    scratches.recycle(end);
                }
            }

            if (parts.reload & 1) {
                CONTEXT_STORE(c, generated_vars.argb[i].c);
            }
        }
    } else {
        // We're not smoothing, so we can just use a packed version
        // of the color and extract the components as needed
        // (or not at all if we don't blend)

        // figure out if we need the iterated color
        int load = 0;
        for (int i=0 ; i<4 ; i++) {
            component_info_t& info = mInfo[i];
            if ((info.inDest || info.needed) && !info.replaced)
                load |= 1;
        }

        parts.iterated_packed = 1;
        parts.packed = (!mTextureMachine.mask && !mBlending
                && !mFog && !mDithering);
        parts.reload = 0;
        if (load || parts.packed) {
            if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) {
                comment("load initial iterated color (8888 packed)");
                parts.iterated.setTo(obtainReg(),
                        &(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
                CONTEXT_LOAD(parts.iterated.reg, packed8888);
            } else {
                comment("load initial iterated color (dest format packed)");

                parts.iterated.setTo(obtainReg(), &mCbFormat);

                // pre-mask the iterated color
                const int bits = parts.iterated.size();
                const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
                uint32_t mask = 0;
                if (mMasking) {
                    for (int i=0 ; i<4 ; i++) {
                        const int component_mask = 1<<i;
                        const int h = parts.iterated.format.c[i].h;
                        const int l = parts.iterated.format.c[i].l;
                        if (h && (!(mMasking & component_mask))) {
                            mask |= ((1<<(h-l))-1) << l;
                        }
                    }
                }

                if (mMasking && ((mask & size)==0)) {
                    // none of the components are present in the mask
                } else {
                    CONTEXT_LOAD(parts.iterated.reg, packed);
                    if (mCbFormat.size == 1) {
                        AND(AL, 0, parts.iterated.reg,
                                parts.iterated.reg, imm(0xFF));
                    } else if (mCbFormat.size == 2) {
                        MOV(AL, 0, parts.iterated.reg,
                                reg_imm(parts.iterated.reg, LSR, 16));
                    }
                }

                // pre-mask the iterated color
                if (mMasking) {
                    build_and_immediate(parts.iterated.reg, parts.iterated.reg,
                            mask, bits);
                }
            }
        }
    }
}

void GGLAssembler::build_iterated_color(
        component_t& fragment,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE);

    if (!mInfo[component].iterated)
        return;

    if (parts.iterated_packed) {
        // iterated colors are packed, extract the one we need
        extract(fragment, parts.iterated, component);
    } else {
        fragment.h = GGL_COLOR_BITS;
        fragment.l = GGL_COLOR_BITS - 8;
        fragment.flags |= CLEAR_LO;
        // iterated colors are held in their own register
        // (smooth and/or dithering case)
        if (parts.reload==3) {
            // this implies mSmooth
            Scratch scratches(registerFile());
            int dx = scratches.obtain();
            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
            CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
            ADD(AL, 0, dx, fragment.reg, dx);
            CONTEXT_STORE(dx, generated_vars.argb[component].c);
        } else if (parts.reload & 1) {
            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
        } else {
            // we don't reload, so simply rename the register and mark it
            // non-CORRUPTIBLE so that the texture env or blending code
            // won't modify this (renamed) register
            regs.recycle(fragment.reg);
            fragment.reg = parts.argb[component].reg;
            fragment.flags &= ~CORRUPTIBLE;
        }
        if (mInfo[component].smooth && mAA) {
            // when using smooth shading AND anti-aliasing, we need to clamp
            // the iterators because there is always an extra pixel on the
            // edges, which most of the time will cause an overflow
            // (since technically it's outside of the domain).
            BIC(AL, 0, fragment.reg, fragment.reg,
                    reg_imm(fragment.reg, ASR, 31));
            component_sat(fragment);
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
{
    // gather some information about the components we need to process...
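    // the needs field stores only the low 4 bits of the logic-op enum
    // (GL_CLEAR..GL_SET span 0x1500..0x150F); OR'ing with GGL_CLEAR
    // restores the full value before switching on it.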
    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
    switch(opcode) {
    case GGL_COPY:
        mLogicOp = 0;
        break;
    case GGL_CLEAR:
    case GGL_SET:
        mLogicOp = LOGIC_OP;
        break;
    case GGL_AND:
    case GGL_AND_REVERSE:
    case GGL_AND_INVERTED:
    case GGL_XOR:
    case GGL_OR:
    case GGL_NOR:
    case GGL_EQUIV:
    case GGL_OR_REVERSE:
    case GGL_OR_INVERTED:
    case GGL_NAND:
        mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST;
        break;
    case GGL_NOOP:
    case GGL_INVERT:
        mLogicOp = LOGIC_OP|LOGIC_OP_DST;
        break;
    case GGL_COPY_INVERTED:
        mLogicOp = LOGIC_OP|LOGIC_OP_SRC;
        break;
    };
}

void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
{
    uint8_t replaced=0;
    mTextureMachine.mask = 0;
    mTextureMachine.activeUnits = 0;
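    // Walk the TMUs from last to first: 'replaced' accumulates the
    // components that a later unit overwrites with GGL_REPLACE, so each
    // earlier unit sees (in tmu.replaced) which components it can skip.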
    for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (replaced == 0xF) {
            // all components are replaced, skip this TMU.
            tmu.format_idx = 0;
            tmu.mask = 0;
            tmu.replaced = replaced;
            continue;
        }
        tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
        tmu.format = c->formats[tmu.format_idx];
        tmu.bits = tmu.format.size*8;
        tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
        tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
        tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
        tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
        tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
                && tmu.format.size!=3; // XXX: only 8-, 16- and 32-bit formats for now

        // 5551 linear filtering is not supported
        if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
            tmu.linear = 0;

        tmu.mask = 0;
        tmu.replaced = replaced;

        if (tmu.format_idx) {
            mTextureMachine.activeUnits++;
            if (tmu.format.c[0].h)    tmu.mask |= 0x1;
            if (tmu.format.c[1].h)    tmu.mask |= 0x2;
            if (tmu.format.c[2].h)    tmu.mask |= 0x4;
            if (tmu.format.c[3].h)    tmu.mask |= 0x8;
            if (tmu.env == GGL_REPLACE) {
                replaced |= tmu.mask;
            } else if (tmu.env == GGL_DECAL) {
                if (!tmu.format.c[GGLFormat::ALPHA].h) {
                    // if we don't have alpha, decal does nothing
                    tmu.mask = 0;
                } else {
                    // decal always ignores At
                    tmu.mask &= ~(1<<GGLFormat::ALPHA);
                }
            }
        }
        mTextureMachine.mask |= tmu.mask;
        //printf("%d: mask=%08lx, replaced=%08lx\n",
        //    i, int(tmu.mask), int(tmu.replaced));
    }
    mTextureMachine.replaced = replaced;
    mTextureMachine.directTexture = 0;
    //printf("replaced=%08lx\n", mTextureMachine.replaced);
}


void GGLAssembler::init_textures(
        tex_coord_t* coords,
        const reg_t& x, const reg_t& y)
{
    const needs_t& needs = mBuilderContext.needs;
    int Rx = x.reg;
    int Ry = y.reg;

    if (mTextureMachine.mask) {
        comment("compute texture coordinates");
    }

    // init texture coordinates for each tmu
    const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;
        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        {
            // 1:1 texture
            pointer_t& txPtr = coords[i].ptr;
            txPtr.setTo(obtainReg(), tmu.bits);
            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
            ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16));    // x += (s>>16)
            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
            ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16));    // y += (t>>16)
            // merge base & offset
            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
            SMLABB(AL, Rx, Ry, txPtr.reg, Rx);               // x+y*stride
            CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
            base_offset(txPtr, txPtr, Rx);
        } else {
            Scratch scratches(registerFile());
            reg_t& s = coords[i].s;
            reg_t& t = coords[i].t;
            // s = (x * dsdx)>>16 + ydsdy
            // s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
            // t = (x * dtdx)>>16 + ydtdy
            // t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
            s.setTo(obtainReg());
            t.setTo(obtainReg());
            const int need_w = GGL_READ_NEEDS(W, needs.n);
            if (need_w) {
                CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
                CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
            } else {
                int ydsdy = scratches.obtain();
                int ydtdy = scratches.obtain();
                CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
                CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
                CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
                CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
                MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
                MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
            }

            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
                recycleReg(s.reg);
                recycleReg(t.reg);
            }
        }

        // direct texture?
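        // a "direct texture" is possible when a single unit fully
        // determines the output: its format matches the color buffer,
        // it replaces every component it produces, and no blending,
        // dithering, fog or filtering touches the result afterwards.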
        if (!multiTexture && !mBlending && !mDithering && !mFog &&
            cb_format_idx == tmu.format_idx && !tmu.linear &&
            mTextureMachine.replaced == tmu.mask)
        {
            mTextureMachine.directTexture = i + 1;
        }
    }
}

void GGLAssembler::build_textures(  fragment_parts_t& parts,
                                    Scratch& regs)
{
    // We don't have a way to spill registers automatically, so
    // spill the depth and AA regs up front when we know we may have to.
    // build the spill list...
    uint32_t spill_list = 0;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;
        if (tmu.linear) {
            // we may run out of registers if we have linear filtering
            // at 1 or 4 bytes / pixel on any texture unit.
            if (tmu.format.size == 1) {
                // if depth and AA are enabled, we'll be one register short
                if (parts.z.reg > 0 && parts.covPtr.reg > 0)
                    spill_list |= 1<<parts.covPtr.reg;
            }
            if (tmu.format.size == 4) {
                // if depth or AA is enabled, we'll be one or two registers short
                if (parts.z.reg > 0)
                    spill_list |= 1<<parts.z.reg;
                if (parts.covPtr.reg > 0)
                    spill_list |= 1<<parts.covPtr.reg;
            }
        }
    }

    Spill spill(registerFile(), *this, spill_list);

    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;

        pointer_t& txPtr = parts.coords[i].ptr;
        pixel_t& texel = parts.texel[i];

        // repeat...
        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        { // 1:1 textures
            comment("fetch texel");
            texel.setTo(regs.obtain(), &tmu.format);
            load(txPtr, texel, WRITE_BACK);
        } else {
            Scratch scratches(registerFile());
            reg_t& s = parts.coords[i].s;
            reg_t& t = parts.coords[i].t;
            if ((mOptLevel&1)==0) {
                comment("reload s/t (multitexture or linear filtering)");
                s.reg = scratches.obtain();
                t.reg = scratches.obtain();
                CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
            }

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            comment("compute repeat/clamp");
            int u       = scratches.obtain();
            int v       = scratches.obtain();
            int width   = scratches.obtain();
            int height  = scratches.obtain();
            int U = 0;
            int V = 0;

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(width,  generated_vars.texture[i].width);
            CONTEXT_LOAD(height, generated_vars.texture[i].height);

            int FRAC_BITS = 0;
            if (tmu.linear) {
                // linear interpolation
                if (tmu.format.size == 1) {
                    // for 8-bit textures, we can afford
                    // 7 bits of fractional precision at no
                    // additional cost (we can't do 8 bits
                    // because filter8 uses signed 16-bit muls)
                    FRAC_BITS = 7;
                } else if (tmu.format.size == 2) {
                    // filter16() is internally limited to 4 bits, so:
                    // FRAC_BITS=2 generates fewer instructions,
                    // FRAC_BITS=3,4,5 creates unpleasant artifacts,
                    // FRAC_BITS=6+ looks good
                    FRAC_BITS = 6;
                } else if (tmu.format.size == 4) {
                    // filter32() is internally limited to 8 bits, so:
                    // FRAC_BITS=4 looks good
                    // FRAC_BITS=5+ looks better, but generates 3 extra
                    // instructions per pixel
                    FRAC_BITS = 6;
                } else {
                    // for all other cases we use 4 bits.
                    FRAC_BITS = 4;
                }
            }
            wrapping(u, s.reg, width,  tmu.swrap, FRAC_BITS);
            wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);

            if (tmu.linear) {
                comment("compute linear filtering offsets");
                // pixel size scale
                const int shift = 31 - gglClz(tmu.format.size);
                U = scratches.obtain();
                V = scratches.obtain();

                if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                    return;

                // sample the texel center
                SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
                SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));

                // get the fractional part of U,V
                AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
                AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));

                // compute width-1 and height-1
                SUB(AL, 0, width,  width,  imm(1));
                SUB(AL, 0, height, height, imm(1));

                // get the integer part of U,V and clamp/wrap
                // and compute offset to the next texel
                if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
                    // u has already been REPEATed
                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
                    MOV(MI, 0, u, width);
                    CMP(AL, u, width);
                    MOV(LT, 0, width, imm(1 << shift));
                    if (shift)
                        MOV(GE, 0, width, reg_imm(width, LSL, shift));
                    RSB(GE, 0, width, width, imm(0));
                } else {
                    // u has not been CLAMPed yet
                    // algorithm:
                    // if ((u>>4) >= width)
                    //      u = width<<4
                    //      width = 0
                    // else
                    //      width = 1<<shift
                    // u = u>>4; // get integer part
                    // if (u<0)
                    //      u = 0
                    //      width = 0
                    // generated_vars.rt = width

                    CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
                    MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
                    MOV(LE, 0, width, imm(0));
                    MOV(GT, 0, width, imm(1 << shift));
                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
                    MOV(MI, 0, u, imm(0));
                    MOV(MI, 0, width, imm(0));
                }
                CONTEXT_STORE(width, generated_vars.rt);

                const int stride = width;
                CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
                if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
                    // v has already been REPEATed
                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
                    MOV(MI, 0, v, height);
                    CMP(AL, v, height);
                    MOV(LT, 0, height, imm(1 << shift));
                    if (shift)
                        MOV(GE, 0, height, reg_imm(height, LSL, shift));
                    RSB(GE, 0, height, height, imm(0));
                    MUL(AL, 0, height, stride, height);
                } else {
                    // v has not been CLAMPed yet
                    CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
                    MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
                    MOV(LE, 0, height, imm(0));
                    if (shift) {
                        MOV(GT, 0, height, reg_imm(stride, LSL, shift));
                    } else {
                        MOV(GT, 0, height, stride);
                    }
                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
                    MOV(MI, 0, v, imm(0));
                    MOV(MI, 0, height, imm(0));
                }
                CONTEXT_STORE(height, generated_vars.lb);
            }

            scratches.recycle(width);
            scratches.recycle(height);

            // iterate texture coordinates...
            comment("iterate s,t");
            int dsdx = scratches.obtain();
            int dtdx = scratches.obtain();

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
            ADD(AL, 0, s.reg, s.reg, dsdx);
            ADD(AL, 0, t.reg, t.reg, dtdx);
            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
                scratches.recycle(s.reg);
                scratches.recycle(t.reg);
            }
            scratches.recycle(dsdx);
            scratches.recycle(dtdx);

            // merge base & offset...
            comment("merge base & offset");
            texel.setTo(regs.obtain(), &tmu.format);
            txPtr.setTo(texel.reg, tmu.bits);
            int stride = scratches.obtain();

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(stride,    generated_vars.texture[i].stride);
            CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
            SMLABB(AL, u, v, stride, u);    // u+v*stride
            base_offset(txPtr, txPtr, u);

            // load texel
            if (!tmu.linear) {
                comment("fetch texel");
                load(txPtr, texel, 0);
            } else {
                // recycle registers we don't need anymore
                scratches.recycle(u);
                scratches.recycle(v);
                scratches.recycle(stride);

                comment("fetch texel, bilinear");
                switch (tmu.format.size) {
                case 1:  filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                }
            }
        }
    }
}

void GGLAssembler::build_iterate_texture_coordinates(
    const fragment_parts_t& parts)
{
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;

        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        { // 1:1 textures
            const pointer_t& txPtr = parts.coords[i].ptr;
            ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
        } else {
            Scratch scratches(registerFile());
            int s = parts.coords[i].s.reg;
            int t = parts.coords[i].t.reg;
            if ((mOptLevel&1)==0) {
                s = scratches.obtain();
                t = scratches.obtain();
                CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
                CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
            }
            int dsdx = scratches.obtain();
            int dtdx = scratches.obtain();
            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
            ADD(AL, 0, s, s, dsdx);
            ADD(AL, 0, t, t, dtdx);
            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
            }
        }
    }
}

void GGLAssembler::filter8(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    if (tmu.format.components != GGL_ALPHA &&
        tmu.format.components != GGL_LUMINANCE)
    {
        // this is a packed format, and we don't support
        // linear filtering (it's probably RGB 332)
        // Should not happen with OpenGL|ES
        LDRB(AL, texel.reg, txPtr.reg);
        return;
    }

    // ------------------------
    // about ~22 cycles / pixel
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int d    = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();
    int rt   = scratches.obtain();
    int lb   = scratches.obtain();

    // RB -> U * V

    CONTEXT_LOAD(rt, generated_vars.rt);
    CONTEXT_LOAD(lb, generated_vars.lb);

    int offset = pixel;
    ADD(AL, 0, offset, lb, rt);
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    SMULBB(AL, d, pixel, u);
    RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));

    // LB -> (1-U) * V
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
    SMULBB(AL, u, U, V);
    SMLABB(AL, d, pixel, u, d);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDRB(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    SMLABB(AL, d, pixel, u, d);

    // RT -> U*(1-V)
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
    SUB(AL, 0, u, k, u);
    SMLABB(AL, texel.reg, pixel, u, d);
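    // note: the four weights sum exactly to 1<<(FRAC_BITS*2). k tracks the
    // weight not yet spent (1 - U*V - (1-U)*V = 1-V after the first two
    // taps), so RT's weight U*(1-V) is obtained as k - u without an
    // extra multiply.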

    for (int i=0 ; i<4 ; i++) {
        if (!texel.format.c[i].h) continue;
        texel.format.c[i].h = FRAC_BITS*2+8;
        texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits is enough
    }
    texel.format.size = 4;
    texel.format.bitsPerPixel = 32;
    texel.flags |= CLEAR_LO;
}

void GGLAssembler::filter16(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    // compute the mask
    // XXX: it would be nice if the mask below could be computed
    // automatically.
    uint32_t mask = 0;
    int shift = 0;
    int prec = 0;
    switch (tmu.format_idx) {
        case GGL_PIXEL_FORMAT_RGB_565:
            // source: 00000ggg.ggg00000 | rrrrr000.000bbbbb
            // result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb
            mask = 0x07E0F81F;
            shift = 16;
            prec = 5;
            break;
        case GGL_PIXEL_FORMAT_RGBA_4444:
            // 0000,1111,0000,1111 | 0000,1111,0000,1111
            mask = 0x0F0F0F0F;
            shift = 12;
            prec = 4;
            break;
        case GGL_PIXEL_FORMAT_LA_88:
            // 0000,0000,1111,1111 | 0000,0000,1111,1111
            // AALL -> 00AA | 00LL
            mask = 0x00FF00FF;
            shift = 8;
            prec = 8;
            break;
        default:
            // unsupported format, do something sensible...
            ALOGE("Unsupported 16-bit texture format (%d)", tmu.format_idx);
            LDRH(AL, texel.reg, txPtr.reg);
            return;
    }
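
    // The trick used below: replicate the 16-bit pixel into a 32-bit word
    // (ORR with itself shifted left by 'shift'), then AND with 'mask' so
    // that each component sits in its own field with at least 'prec' zero
    // bits above it. A single MUL by a weight of at most 'prec' bits then
    // scales all components in parallel without cross-component carries.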

    const int adjust = FRAC_BITS*2 - prec;
    const int round  = 0;
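    // the bilinear weights carry FRAC_BITS*2 fractional bits; 'adjust'
    // shifts them down to 'prec' bits so that component*weight still fits
    // in the zero-padded fields prepared above.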

    // update the texel format
    texel.format.size = 4;
    texel.format.bitsPerPixel = 32;
    texel.flags |= CLEAR_HI|CLEAR_LO;
    for (int i=0 ; i<4 ; i++) {
        if (!texel.format.c[i].h) continue;
        const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
        texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
        texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
    }

    // ------------------------
    // about ~40 cycles / pixel
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int d    = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();

    // RB -> U * V
    int offset = pixel;
    CONTEXT_LOAD(offset, generated_vars.rt);
    CONTEXT_LOAD(u, generated_vars.lb);
    ADD(AL, 0, offset, offset, u);

    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MUL(AL, 0, d, pixel, u);
    RSB(AL, 0, k, u, imm(1<<prec));

    // LB -> (1-U) * V
    CONTEXT_LOAD(offset, generated_vars.lb);
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, d, pixel, u, d);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDRH(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, d, pixel, u, d);

    // RT -> U*(1-V)
    CONTEXT_LOAD(offset, generated_vars.rt);
    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SUB(AL, 0, u, k, u);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    MLA(AL, 0, texel.reg, pixel, u, d);
}

void GGLAssembler::filter24(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& /*tmu*/,
        int /*U*/, int /*V*/, pointer_t& txPtr,
        int /*FRAC_BITS*/)
{
    // not supported yet (currently disabled)
    load(txPtr, texel, 0);
}

void GGLAssembler::filter32(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& /*tmu*/,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    const int adjust = FRAC_BITS*2 - 8;
    const int round  = 0;

    // ------------------------
    // about ~38 cycles / pixel
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int dh   = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();

    int temp = scratches.obtain();
    int dl   = scratches.obtain();
    int mask = scratches.obtain();

    MOV(AL, 0, mask, imm(0xFF));
    ORR(AL, 0, mask, mask, imm(0xFF0000));
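
    // Same parallel-multiply trick as filter16: 'mask' (0x00FF00FF) picks
    // the even bytes of the 8888 pixel (accumulated in dh) while
    // (pixel >> 8) & mask picks the odd bytes (accumulated in dl); each
    // MUL/MLA by an 8-bit weight scales two components at once, and the
    // two halves are recombined at the end.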

    // RB -> U * V
    int offset = pixel;
    CONTEXT_LOAD(offset, generated_vars.rt);
    CONTEXT_LOAD(u, generated_vars.lb);
    ADD(AL, 0, offset, offset, u);

    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MUL(AL, 0, dh, temp, u);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MUL(AL, 0, dl, temp, u);
    RSB(AL, 0, k, u, imm(0x100));

    // LB -> (1-U) * V
    CONTEXT_LOAD(offset, generated_vars.lb);
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);

    // RT -> U*(1-V)
    CONTEXT_LOAD(offset, generated_vars.rt);
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SUB(AL, 0, u, k, u);
    AND(AL, 0, temp, mask, pixel);
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);

    AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
    AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
    ORR(AL, 0, texel.reg, dh, dl);
}

void GGLAssembler::build_texture_environment(
        component_t& fragment,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    const uint32_t component_mask = 1<<component;
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];

        if (tmu.mask & component_mask) {
            // replace or modulate with this texture
            if ((tmu.replaced & component_mask) == 0) {
                // not replaced by a later tmu...

                Scratch scratches(registerFile());
                pixel_t texel(parts.texel[i]);

                if (multiTexture &&
                    tmu.swrap == GGL_NEEDS_WRAP_11 &&
                    tmu.twrap == GGL_NEEDS_WRAP_11)
                {
                    texel.reg = scratches.obtain();
                    texel.flags |= CORRUPTIBLE;
                    comment("fetch texel (multitexture 1:1)");
                    load(parts.coords[i].ptr, texel, WRITE_BACK);
                }

                component_t incoming(fragment);
                modify(fragment, regs);

                switch (tmu.env) {
                case GGL_REPLACE:
                    extract(fragment, texel, component);
                    break;
                case GGL_MODULATE:
                    modulate(fragment, incoming, texel, component);
                    break;
                case GGL_DECAL:
                    decal(fragment, incoming, texel, component);
                    break;
                case GGL_BLEND:
                    blend(fragment, incoming, texel, component, i);
                    break;
                case GGL_ADD:
                    add(fragment, incoming, texel, component);
                    break;
                }
            }
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::wrapping(
            int d,
            int coord, int size,
            int tx_wrap, int tx_linear)
{
    // notes:
    // if tx_linear is set, we need 4 extra bits of precision on the result
    // SMULL/UMULL is 3 cycles
    Scratch scratches(registerFile());
    int c = coord;
    if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
        // UMULL takes 4 cycles (interlocked), and we can get away with
        // 2 cycles using SMULWB, but we're losing 16 bits of precision
        // out of 32 (this is not a problem because the iterator keeps
        // its full precision)
        // UMULL(AL, 0, size, d, c, size);
        // note: we can't use SMULTB because it's signed.
        MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
        SMULWB(AL, d, d, size);
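        // i.e. d = ((coord >> (16 - tx_linear)) * size) >> 16: the
        // wrapped coordinate scaled by the texture size, with tx_linear
        // extra fraction bits kept for the filter.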
    } else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
        if (tx_linear) {
            // 1 cycle
            MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
        } else {
            // 4 cycles (common case)
            MOV(AL, 0, d, reg_imm(coord, ASR, 16));
            BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
            CMP(AL, d, size);
            SUB(GE, 0, d, size, imm(1));
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::modulate(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    Scratch locals(registerFile());
    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
    extract(texel, incomingTexel, component);

    const int Nt = texel.size();
        // Nt should always be less than 10 bits because it comes
        // from the TMU.

    int Ni = incoming.size();
        // Ni could be big because it comes from previous MODULATEs

    if (Nt == 1) {
        // texel acts as a bit-mask
        // dest = incoming & ((texel << incoming.h)-texel)
        RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
        AND(AL, 0, dest.reg, dest.reg, incoming.reg);
        dest.l = incoming.l;
        dest.h = incoming.h;
        dest.flags |= (incoming.flags & CLEAR_LO);
    } else if (Ni == 1) {
        MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
        AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
        dest.l = 0;
        dest.h = Nt;
    } else {
        int inReg = incoming.reg;
        int shift = incoming.l;
        if ((Nt + Ni) > 32) {
            // we will overflow, reduce the precision of Ni to 8 bits
            // (Note Nt cannot be more than 10 bits which happens with
            // 565 textures and GGL_LINEAR)
            shift += Ni-8;
            Ni = 8;
        }

        // modulate by the component with the lowest precision
        if (Nt >= Ni) {
            if (shift) {
                // XXX: we should be able to avoid this shift
                // when shift==16 && Nt<16 && Ni<16, in which case
                // we could use SMULBT below.
                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
                inReg = dest.reg;
                shift = 0;
            }
            // operation:           (Cf*Ct)/((1<<Ni)-1)
            // approximated with:   Cf*(Ct + Ct>>(Ni-1))>>Ni
            // this operation doesn't change texel's size
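            // e.g. with Ni=8 and a full-scale operand 0xFF:
            // 0xFF + (0xFF>>7) = 0x100, so the multiply-and-shift returns
            // the other operand unchanged; the added >>(Ni-1) term
            // compensates for dividing by 1<<Ni instead of (1<<Ni)-1.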
            ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
            if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
            else                MUL(AL, 0, dest.reg, texel.reg, dest.reg);
            dest.l = Ni;
            dest.h = Nt + Ni;
        } else {
            if (shift && (shift != 16)) {
                // if shift==16, we can use 16-bit mul instructions later
                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
                inReg = dest.reg;
                shift = 0;
            }
            // operation:           (Cf*Ct)/((1<<Nt)-1)
            // approximated with:   Ct*(Cf + Cf>>(Nt-1))>>Nt
            // this operation doesn't change incoming's size
            Scratch scratches(registerFile());
            int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
            if (t == inReg)
                t = scratches.obtain();
            ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
            if (Nt<16 && Ni<16) {
                if (shift==16)  SMULBT(AL, dest.reg, t, inReg);
                else            SMULBB(AL, dest.reg, t, inReg);
            } else              MUL(AL, 0, dest.reg, t, inReg);
            dest.l = Nt;
            dest.h = Nt + Ni;
        }

        // low bits are not valid
        dest.flags |= CLEAR_LO;

        // no need to keep more than 8 bits/component
        if (dest.size() > 8)
            dest.l = dest.h-8;
    }
}

void GGLAssembler::decal(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    // RGBA:
    // Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At
    // Av = Af
    Scratch locals(registerFile());
    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
    extract(texel, incomingTexel, component);
    extract(factor, incomingTexel, GGLFormat::ALPHA);

    // no need to keep more than 8 bits for decal
    int Ni = incoming.size();
    int shift = incoming.l;
    if (Ni > 8) {
        shift += Ni-8;
        Ni = 8;
    }
    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
    if (shift) {
        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
        incomingNorm.reg = dest.reg;
        incomingNorm.flags |= CORRUPTIBLE;
    }
    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
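    // factor + factor>>(s-1) maps a full-scale At ((1<<s)-1) to exactly
    // 1<<s, so the blend below can be done with a power-of-two shift.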
    build_blendOneMinusFF(dest, factor, incomingNorm, texel);
}

void GGLAssembler::blend(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component, int tmu)
{
    // RGBA:
    // Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct
    // Av = At*Af

    if (component == GGLFormat::ALPHA) {
        modulate(dest, incoming, incomingTexel, component);
        return;
    }

    Scratch locals(registerFile());
    integer_t color(locals.obtain(), 8, CORRUPTIBLE);
    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
    LDRB(AL, color.reg, mBuilderContext.Rctx,
            immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
    extract(factor, incomingTexel, component);

    // no need to keep more than 8 bits for blend
    int Ni = incoming.size();
    int shift = incoming.l;
    if (Ni > 8) {
        shift += Ni-8;
        Ni = 8;
    }
    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
    if (shift) {
        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
        incomingNorm.reg = dest.reg;
        incomingNorm.flags |= CORRUPTIBLE;
    }
    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
    build_blendOneMinusFF(dest, factor, incomingNorm, color);
}

void GGLAssembler::add(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    // RGBA:
    // Cv = Cf + Ct;
    Scratch locals(registerFile());

    component_t incomingTemp(incoming);

    // use "dest" as a temporary for extracting the texel, unless "dest"
    // overlaps "incoming".
    integer_t texel(dest.reg, 32, CORRUPTIBLE);
    if (dest.reg == incomingTemp.reg)
        texel.reg = locals.obtain();
    extract(texel, incomingTexel, component);

    if (texel.s < incomingTemp.size()) {
        expand(texel, texel, incomingTemp.size());
    } else if (texel.s > incomingTemp.size()) {
        if (incomingTemp.flags & CORRUPTIBLE) {
            expand(incomingTemp, incomingTemp, texel.s);
        } else {
            incomingTemp.reg = locals.obtain();
            expand(incomingTemp, incoming, texel.s);
        }
    }

    if (incomingTemp.l) {
        ADD(AL, 0, dest.reg, texel.reg,
                reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
    } else {
        ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
    }
    dest.l = 0;
    dest.h = texel.size();
    component_sat(dest);
}

// ----------------------------------------------------------------------------

}; // namespace android