1 /*
2  * Mesa 3-D graphics library
3  *
4  * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included
14  * in all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 /**
26  * \file prog_execute.c
27  * Software interpreter for vertex/fragment programs.
28  * \author Brian Paul
29  */
30 
31 /*
32  * NOTE: we do everything in single-precision floating point; we don't
33  * currently observe the single/half/fixed-precision qualifiers.
34  *
35  */
36 
37 
38 #include "c99_math.h"
39 #include "main/errors.h"
40 #include "main/glheader.h"
41 #include "main/macros.h"
42 #include "main/mtypes.h"
43 #include "prog_execute.h"
44 #include "prog_instruction.h"
45 #include "prog_parameter.h"
46 #include "prog_print.h"
47 #include "prog_noise.h"
48 
49 
50 /* debug predicate */
51 #define DEBUG_PROG 0
52 
53 
/**
 * Set x to positive or negative infinity.
 *
 * The bit pattern is stored through the integer member of an fi_type
 * and read back through its float member, then assigned to \p x.
 */
#define SET_POS_INFINITY(x)                  \
   do {                                      \
      fi_type inf_bits_;                     \
      inf_bits_.i = 0x7F800000;              \
      (x) = inf_bits_.f;                     \
   } while (0)
#define SET_NEG_INFINITY(x)                  \
   do {                                      \
      fi_type inf_bits_;                     \
      inf_bits_.i = 0xFF800000;              \
      (x) = inf_bits_.f;                     \
   } while (0)

/**
 * Write \p bits into the storage of lvalue \p x by viewing that storage
 * as an fi_type and assigning its integer member.
 */
#define SET_FLOAT_BITS(x, bits) ((fi_type *) (void *) &(x))->i = (bits)
71 
72 
73 static const GLfloat ZeroVec[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
74 
75 
76 /**
77  * Return a pointer to the 4-element float vector specified by the given
78  * source register.
79  */
80 static inline const GLfloat *
get_src_register_pointer(const struct prog_src_register * source,const struct gl_program_machine * machine)81 get_src_register_pointer(const struct prog_src_register *source,
82                          const struct gl_program_machine *machine)
83 {
84    const struct gl_program *prog = machine->CurProgram;
85    GLint reg = source->Index;
86 
87    if (source->RelAddr) {
88       /* add address register value to src index/offset */
89       reg += machine->AddressReg[0][0];
90       if (reg < 0) {
91          return ZeroVec;
92       }
93    }
94 
95    switch (source->File) {
96    case PROGRAM_TEMPORARY:
97       if (reg >= MAX_PROGRAM_TEMPS)
98          return ZeroVec;
99       return machine->Temporaries[reg];
100 
101    case PROGRAM_INPUT:
102       if (prog->Target == GL_VERTEX_PROGRAM_ARB) {
103          if (reg >= VERT_ATTRIB_MAX)
104             return ZeroVec;
105          return machine->VertAttribs[reg];
106       }
107       else {
108          if (reg >= VARYING_SLOT_MAX)
109             return ZeroVec;
110          return machine->Attribs[reg][machine->CurElement];
111       }
112 
113    case PROGRAM_OUTPUT:
114       if (reg >= MAX_PROGRAM_OUTPUTS)
115          return ZeroVec;
116       return machine->Outputs[reg];
117 
118    case PROGRAM_STATE_VAR:
119       FALLTHROUGH;
120    case PROGRAM_CONSTANT:
121       FALLTHROUGH;
122    case PROGRAM_UNIFORM: {
123       if (reg >= (GLint) prog->Parameters->NumParameters)
124          return ZeroVec;
125 
126       unsigned pvo = prog->Parameters->Parameters[reg].ValueOffset;
127       return (GLfloat *) prog->Parameters->ParameterValues + pvo;
128    }
129    case PROGRAM_SYSTEM_VALUE:
130       assert(reg < (GLint) ARRAY_SIZE(machine->SystemValues));
131       return machine->SystemValues[reg];
132 
133    default:
134       _mesa_problem(NULL,
135          "Invalid src register file %d in get_src_register_pointer()",
136          source->File);
137       return ZeroVec;
138    }
139 }
140 
141 
142 /**
143  * Return a pointer to the 4-element float vector specified by the given
144  * destination register.
145  */
146 static inline GLfloat *
get_dst_register_pointer(const struct prog_dst_register * dest,struct gl_program_machine * machine)147 get_dst_register_pointer(const struct prog_dst_register *dest,
148                          struct gl_program_machine *machine)
149 {
150    static GLfloat dummyReg[4];
151    GLint reg = dest->Index;
152 
153    if (dest->RelAddr) {
154       /* add address register value to src index/offset */
155       reg += machine->AddressReg[0][0];
156       if (reg < 0) {
157          return dummyReg;
158       }
159    }
160 
161    switch (dest->File) {
162    case PROGRAM_TEMPORARY:
163       if (reg >= MAX_PROGRAM_TEMPS)
164          return dummyReg;
165       return machine->Temporaries[reg];
166 
167    case PROGRAM_OUTPUT:
168       if (reg >= MAX_PROGRAM_OUTPUTS)
169          return dummyReg;
170       return machine->Outputs[reg];
171 
172    default:
173       _mesa_problem(NULL,
174          "Invalid dest register file %d in get_dst_register_pointer()",
175          dest->File);
176       return dummyReg;
177    }
178 }
179 
180 
181 
182 /**
183  * Fetch a 4-element float vector from the given source register.
184  * Apply swizzling and negating as needed.
185  */
186 static void
fetch_vector4(const struct prog_src_register * source,const struct gl_program_machine * machine,GLfloat result[4])187 fetch_vector4(const struct prog_src_register *source,
188               const struct gl_program_machine *machine, GLfloat result[4])
189 {
190    const GLfloat *src = get_src_register_pointer(source, machine);
191 
192    if (source->Swizzle == SWIZZLE_NOOP) {
193       /* no swizzling */
194       COPY_4V(result, src);
195    }
196    else {
197       assert(GET_SWZ(source->Swizzle, 0) <= 3);
198       assert(GET_SWZ(source->Swizzle, 1) <= 3);
199       assert(GET_SWZ(source->Swizzle, 2) <= 3);
200       assert(GET_SWZ(source->Swizzle, 3) <= 3);
201       result[0] = src[GET_SWZ(source->Swizzle, 0)];
202       result[1] = src[GET_SWZ(source->Swizzle, 1)];
203       result[2] = src[GET_SWZ(source->Swizzle, 2)];
204       result[3] = src[GET_SWZ(source->Swizzle, 3)];
205    }
206 
207    if (source->Negate) {
208       assert(source->Negate == NEGATE_XYZW);
209       result[0] = -result[0];
210       result[1] = -result[1];
211       result[2] = -result[2];
212       result[3] = -result[3];
213    }
214 
215 #ifdef NAN_CHECK
216    assert(!util_is_inf_or_nan(result[0]));
217    assert(!util_is_inf_or_nan(result[0]));
218    assert(!util_is_inf_or_nan(result[0]));
219    assert(!util_is_inf_or_nan(result[0]));
220 #endif
221 }
222 
223 
224 /**
225  * Fetch the derivative with respect to X or Y for the given register.
226  * XXX this currently only works for fragment program input attribs.
227  */
228 static void
fetch_vector4_deriv(const struct prog_src_register * source,const struct gl_program_machine * machine,char xOrY,GLfloat result[4])229 fetch_vector4_deriv(const struct prog_src_register *source,
230                     const struct gl_program_machine *machine,
231                     char xOrY, GLfloat result[4])
232 {
233    if (source->File == PROGRAM_INPUT &&
234        source->Index < (GLint) machine->NumDeriv) {
235       const GLint col = machine->CurElement;
236       const GLfloat w = machine->Attribs[VARYING_SLOT_POS][col][3];
237       const GLfloat invQ = 1.0f / w;
238       GLfloat deriv[4];
239 
240       if (xOrY == 'X') {
241          deriv[0] = machine->DerivX[source->Index][0] * invQ;
242          deriv[1] = machine->DerivX[source->Index][1] * invQ;
243          deriv[2] = machine->DerivX[source->Index][2] * invQ;
244          deriv[3] = machine->DerivX[source->Index][3] * invQ;
245       }
246       else {
247          deriv[0] = machine->DerivY[source->Index][0] * invQ;
248          deriv[1] = machine->DerivY[source->Index][1] * invQ;
249          deriv[2] = machine->DerivY[source->Index][2] * invQ;
250          deriv[3] = machine->DerivY[source->Index][3] * invQ;
251       }
252 
253       result[0] = deriv[GET_SWZ(source->Swizzle, 0)];
254       result[1] = deriv[GET_SWZ(source->Swizzle, 1)];
255       result[2] = deriv[GET_SWZ(source->Swizzle, 2)];
256       result[3] = deriv[GET_SWZ(source->Swizzle, 3)];
257 
258       if (source->Negate) {
259          assert(source->Negate == NEGATE_XYZW);
260          result[0] = -result[0];
261          result[1] = -result[1];
262          result[2] = -result[2];
263          result[3] = -result[3];
264       }
265    }
266    else {
267       ASSIGN_4V(result, 0.0, 0.0, 0.0, 0.0);
268    }
269 }
270 
271 
272 /**
273  * As above, but only return result[0] element.
274  */
275 static void
fetch_vector1(const struct prog_src_register * source,const struct gl_program_machine * machine,GLfloat result[4])276 fetch_vector1(const struct prog_src_register *source,
277               const struct gl_program_machine *machine, GLfloat result[4])
278 {
279    const GLfloat *src = get_src_register_pointer(source, machine);
280 
281    result[0] = src[GET_SWZ(source->Swizzle, 0)];
282 
283    if (source->Negate) {
284       result[0] = -result[0];
285    }
286 }
287 
288 
289 /**
290  * Fetch texel from texture.  Use partial derivatives when possible.
291  */
292 static inline void
fetch_texel(struct gl_context * ctx,const struct gl_program_machine * machine,const struct prog_instruction * inst,const GLfloat texcoord[4],GLfloat lodBias,GLfloat color[4])293 fetch_texel(struct gl_context *ctx,
294             const struct gl_program_machine *machine,
295             const struct prog_instruction *inst,
296             const GLfloat texcoord[4], GLfloat lodBias,
297             GLfloat color[4])
298 {
299    const GLuint unit = machine->Samplers[inst->TexSrcUnit];
300 
301    /* Note: we only have the right derivatives for fragment input attribs.
302     */
303    if (machine->NumDeriv > 0 &&
304        inst->SrcReg[0].File == PROGRAM_INPUT &&
305        inst->SrcReg[0].Index == VARYING_SLOT_TEX0 + inst->TexSrcUnit) {
306       /* simple texture fetch for which we should have derivatives */
307       GLuint attr = inst->SrcReg[0].Index;
308       machine->FetchTexelDeriv(ctx, texcoord,
309                                machine->DerivX[attr],
310                                machine->DerivY[attr],
311                                lodBias, unit, color);
312    }
313    else {
314       machine->FetchTexelLod(ctx, texcoord, lodBias, unit, color);
315    }
316 }
317 
318 
319 /**
320  * Store 4 floats into a register.  Observe the instructions saturate and
321  * set-condition-code flags.
322  */
323 static void
store_vector4(const struct prog_instruction * inst,struct gl_program_machine * machine,const GLfloat value[4])324 store_vector4(const struct prog_instruction *inst,
325               struct gl_program_machine *machine, const GLfloat value[4])
326 {
327    const struct prog_dst_register *dstReg = &(inst->DstReg);
328    const GLboolean clamp = inst->Saturate;
329    GLuint writeMask = dstReg->WriteMask;
330    GLfloat clampedValue[4];
331    GLfloat *dst = get_dst_register_pointer(dstReg, machine);
332 
333 #if 0
334    if (value[0] > 1.0e10 ||
335        util_is_inf_or_nan(value[0]) ||
336        util_is_inf_or_nan(value[1]) ||
337        util_is_inf_or_nan(value[2]) || util_is_inf_or_nan(value[3]))
338       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
339 #endif
340 
341    if (clamp) {
342       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
343       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
344       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
345       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
346       value = clampedValue;
347    }
348 
349 #ifdef NAN_CHECK
350    assert(!util_is_inf_or_nan(value[0]));
351    assert(!util_is_inf_or_nan(value[0]));
352    assert(!util_is_inf_or_nan(value[0]));
353    assert(!util_is_inf_or_nan(value[0]));
354 #endif
355 
356    if (writeMask & WRITEMASK_X)
357       dst[0] = value[0];
358    if (writeMask & WRITEMASK_Y)
359       dst[1] = value[1];
360    if (writeMask & WRITEMASK_Z)
361       dst[2] = value[2];
362    if (writeMask & WRITEMASK_W)
363       dst[3] = value[3];
364 }
365 
366 
367 /**
368  * Execute the given vertex/fragment program.
369  *
370  * \param ctx  rendering context
371  * \param program  the program to execute
372  * \param machine  machine state (must be initialized)
373  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
374  */
375 GLboolean
_mesa_execute_program(struct gl_context * ctx,const struct gl_program * program,struct gl_program_machine * machine)376 _mesa_execute_program(struct gl_context * ctx,
377                       const struct gl_program *program,
378                       struct gl_program_machine *machine)
379 {
380    const GLuint numInst = program->arb.NumInstructions;
381    const GLuint maxExec = 65536;
382    GLuint pc, numExec = 0;
383 
384    machine->CurProgram = program;
385 
386    if (DEBUG_PROG) {
387       printf("execute program %u --------------------\n", program->Id);
388    }
389 
390    if (program->Target == GL_VERTEX_PROGRAM_ARB) {
391       machine->EnvParams = ctx->VertexProgram.Parameters;
392    }
393    else {
394       machine->EnvParams = ctx->FragmentProgram.Parameters;
395    }
396 
397    for (pc = 0; pc < numInst; pc++) {
398       const struct prog_instruction *inst = program->arb.Instructions + pc;
399 
400       if (DEBUG_PROG) {
401          _mesa_print_instruction(inst);
402       }
403 
404       switch (inst->Opcode) {
405       case OPCODE_ABS:
406          {
407             GLfloat a[4], result[4];
408             fetch_vector4(&inst->SrcReg[0], machine, a);
409             result[0] = fabsf(a[0]);
410             result[1] = fabsf(a[1]);
411             result[2] = fabsf(a[2]);
412             result[3] = fabsf(a[3]);
413             store_vector4(inst, machine, result);
414          }
415          break;
416       case OPCODE_ADD:
417          {
418             GLfloat a[4], b[4], result[4];
419             fetch_vector4(&inst->SrcReg[0], machine, a);
420             fetch_vector4(&inst->SrcReg[1], machine, b);
421             result[0] = a[0] + b[0];
422             result[1] = a[1] + b[1];
423             result[2] = a[2] + b[2];
424             result[3] = a[3] + b[3];
425             store_vector4(inst, machine, result);
426             if (DEBUG_PROG) {
427                printf("ADD (%g %g %g %g) = (%g %g %g %g) + (%g %g %g %g)\n",
428                       result[0], result[1], result[2], result[3],
429                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
430             }
431          }
432          break;
433       case OPCODE_ARL:
434          {
435             GLfloat t[4];
436             fetch_vector4(&inst->SrcReg[0], machine, t);
437             machine->AddressReg[0][0] = util_ifloor(t[0]);
438             if (DEBUG_PROG) {
439                printf("ARL %d\n", machine->AddressReg[0][0]);
440             }
441          }
442          break;
443       case OPCODE_BGNLOOP:
444          /* no-op */
445          assert(program->arb.Instructions[inst->BranchTarget].Opcode
446                 == OPCODE_ENDLOOP);
447          break;
448       case OPCODE_ENDLOOP:
449          /* subtract 1 here since pc is incremented by for(pc) loop */
450          assert(program->arb.Instructions[inst->BranchTarget].Opcode
451                 == OPCODE_BGNLOOP);
452          pc = inst->BranchTarget - 1;   /* go to matching BNGLOOP */
453          break;
454       case OPCODE_BGNSUB:      /* begin subroutine */
455          break;
456       case OPCODE_ENDSUB:      /* end subroutine */
457          break;
458       case OPCODE_BRK:         /* break out of loop (conditional) */
459          assert(program->arb.Instructions[inst->BranchTarget].Opcode
460                 == OPCODE_ENDLOOP);
461          /* break out of loop */
462          /* pc++ at end of for-loop will put us after the ENDLOOP inst */
463          pc = inst->BranchTarget;
464          break;
465       case OPCODE_CONT:        /* continue loop (conditional) */
466          assert(program->arb.Instructions[inst->BranchTarget].Opcode
467                 == OPCODE_ENDLOOP);
468          /* continue at ENDLOOP */
469          /* Subtract 1 here since we'll do pc++ at end of for-loop */
470          pc = inst->BranchTarget - 1;
471          break;
472       case OPCODE_CAL:         /* Call subroutine (conditional) */
473          /* call the subroutine */
474          if (machine->StackDepth >= MAX_PROGRAM_CALL_DEPTH) {
475             return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
476          }
477          machine->CallStack[machine->StackDepth++] = pc + 1; /* next inst */
478          /* Subtract 1 here since we'll do pc++ at end of for-loop */
479          pc = inst->BranchTarget - 1;
480          break;
481       case OPCODE_CMP:
482          {
483             GLfloat a[4], b[4], c[4], result[4];
484             fetch_vector4(&inst->SrcReg[0], machine, a);
485             fetch_vector4(&inst->SrcReg[1], machine, b);
486             fetch_vector4(&inst->SrcReg[2], machine, c);
487             result[0] = a[0] < 0.0F ? b[0] : c[0];
488             result[1] = a[1] < 0.0F ? b[1] : c[1];
489             result[2] = a[2] < 0.0F ? b[2] : c[2];
490             result[3] = a[3] < 0.0F ? b[3] : c[3];
491             store_vector4(inst, machine, result);
492             if (DEBUG_PROG) {
493                printf("CMP (%g %g %g %g) = (%g %g %g %g) < 0 ? (%g %g %g %g) : (%g %g %g %g)\n",
494                       result[0], result[1], result[2], result[3],
495                       a[0], a[1], a[2], a[3],
496                       b[0], b[1], b[2], b[3],
497                       c[0], c[1], c[2], c[3]);
498             }
499          }
500          break;
501       case OPCODE_COS:
502          {
503             GLfloat a[4], result[4];
504             fetch_vector1(&inst->SrcReg[0], machine, a);
505             result[0] = result[1] = result[2] = result[3]
506                = cosf(a[0]);
507             store_vector4(inst, machine, result);
508          }
509          break;
510       case OPCODE_DDX:         /* Partial derivative with respect to X */
511          {
512             GLfloat result[4];
513             fetch_vector4_deriv(&inst->SrcReg[0], machine, 'X', result);
514             store_vector4(inst, machine, result);
515          }
516          break;
517       case OPCODE_DDY:         /* Partial derivative with respect to Y */
518          {
519             GLfloat result[4];
520             fetch_vector4_deriv(&inst->SrcReg[0], machine, 'Y', result);
521             store_vector4(inst, machine, result);
522          }
523          break;
524       case OPCODE_DP2:
525          {
526             GLfloat a[4], b[4], result[4];
527             fetch_vector4(&inst->SrcReg[0], machine, a);
528             fetch_vector4(&inst->SrcReg[1], machine, b);
529             result[0] = result[1] = result[2] = result[3] = DOT2(a, b);
530             store_vector4(inst, machine, result);
531             if (DEBUG_PROG) {
532                printf("DP2 %g = (%g %g) . (%g %g)\n",
533                       result[0], a[0], a[1], b[0], b[1]);
534             }
535          }
536          break;
537       case OPCODE_DP3:
538          {
539             GLfloat a[4], b[4], result[4];
540             fetch_vector4(&inst->SrcReg[0], machine, a);
541             fetch_vector4(&inst->SrcReg[1], machine, b);
542             result[0] = result[1] = result[2] = result[3] = DOT3(a, b);
543             store_vector4(inst, machine, result);
544             if (DEBUG_PROG) {
545                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
546                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
547             }
548          }
549          break;
550       case OPCODE_DP4:
551          {
552             GLfloat a[4], b[4], result[4];
553             fetch_vector4(&inst->SrcReg[0], machine, a);
554             fetch_vector4(&inst->SrcReg[1], machine, b);
555             result[0] = result[1] = result[2] = result[3] = DOT4(a, b);
556             store_vector4(inst, machine, result);
557             if (DEBUG_PROG) {
558                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
559                       result[0], a[0], a[1], a[2], a[3],
560                       b[0], b[1], b[2], b[3]);
561             }
562          }
563          break;
564       case OPCODE_DPH:
565          {
566             GLfloat a[4], b[4], result[4];
567             fetch_vector4(&inst->SrcReg[0], machine, a);
568             fetch_vector4(&inst->SrcReg[1], machine, b);
569             result[0] = result[1] = result[2] = result[3] = DOT3(a, b) + b[3];
570             store_vector4(inst, machine, result);
571          }
572          break;
573       case OPCODE_DST:         /* Distance vector */
574          {
575             GLfloat a[4], b[4], result[4];
576             fetch_vector4(&inst->SrcReg[0], machine, a);
577             fetch_vector4(&inst->SrcReg[1], machine, b);
578             result[0] = 1.0F;
579             result[1] = a[1] * b[1];
580             result[2] = a[2];
581             result[3] = b[3];
582             store_vector4(inst, machine, result);
583          }
584          break;
585       case OPCODE_EXP:
586          {
587             GLfloat t[4], q[4], floor_t0;
588             fetch_vector1(&inst->SrcReg[0], machine, t);
589             floor_t0 = floorf(t[0]);
590             if (floor_t0 > FLT_MAX_EXP) {
591                SET_POS_INFINITY(q[0]);
592                SET_POS_INFINITY(q[2]);
593             }
594             else if (floor_t0 < FLT_MIN_EXP) {
595                q[0] = 0.0F;
596                q[2] = 0.0F;
597             }
598             else {
599                q[0] = ldexpf(1.0, (int) floor_t0);
600                /* Note: GL_NV_vertex_program expects
601                 * result.z = result.x * APPX(result.y)
602                 * We do what the ARB extension says.
603                 */
604                q[2] = exp2f(t[0]);
605             }
606             q[1] = t[0] - floor_t0;
607             q[3] = 1.0F;
608             store_vector4( inst, machine, q );
609          }
610          break;
611       case OPCODE_EX2:         /* Exponential base 2 */
612          {
613             GLfloat a[4], result[4], val;
614             fetch_vector1(&inst->SrcReg[0], machine, a);
615             val = exp2f(a[0]);
616             /*
617             if (util_is_inf_or_nan(val))
618                val = 1.0e10;
619             */
620             result[0] = result[1] = result[2] = result[3] = val;
621             store_vector4(inst, machine, result);
622          }
623          break;
624       case OPCODE_FLR:
625          {
626             GLfloat a[4], result[4];
627             fetch_vector4(&inst->SrcReg[0], machine, a);
628             result[0] = floorf(a[0]);
629             result[1] = floorf(a[1]);
630             result[2] = floorf(a[2]);
631             result[3] = floorf(a[3]);
632             store_vector4(inst, machine, result);
633          }
634          break;
635       case OPCODE_FRC:
636          {
637             GLfloat a[4], result[4];
638             fetch_vector4(&inst->SrcReg[0], machine, a);
639             result[0] = a[0] - floorf(a[0]);
640             result[1] = a[1] - floorf(a[1]);
641             result[2] = a[2] - floorf(a[2]);
642             result[3] = a[3] - floorf(a[3]);
643             store_vector4(inst, machine, result);
644          }
645          break;
646       case OPCODE_IF:
647          {
648             GLboolean cond;
649             assert(program->arb.Instructions[inst->BranchTarget].Opcode
650                    == OPCODE_ELSE ||
651                    program->arb.Instructions[inst->BranchTarget].Opcode
652                    == OPCODE_ENDIF);
653             /* eval condition */
654             GLfloat a[4];
655             fetch_vector1(&inst->SrcReg[0], machine, a);
656             cond = (a[0] != 0.0F);
657             if (DEBUG_PROG) {
658                printf("IF: %d\n", cond);
659             }
660             /* do if/else */
661             if (cond) {
662                /* do if-clause (just continue execution) */
663             }
664             else {
665                /* go to the instruction after ELSE or ENDIF */
666                assert(inst->BranchTarget >= 0);
667                pc = inst->BranchTarget;
668             }
669          }
670          break;
671       case OPCODE_ELSE:
672          /* goto ENDIF */
673          assert(program->arb.Instructions[inst->BranchTarget].Opcode
674                 == OPCODE_ENDIF);
675          assert(inst->BranchTarget >= 0);
676          pc = inst->BranchTarget;
677          break;
678       case OPCODE_ENDIF:
679          /* nothing */
680          break;
681       case OPCODE_KIL:         /* ARB_f_p only */
682          {
683             GLfloat a[4];
684             fetch_vector4(&inst->SrcReg[0], machine, a);
685             if (DEBUG_PROG) {
686                printf("KIL if (%g %g %g %g) <= 0.0\n",
687                       a[0], a[1], a[2], a[3]);
688             }
689 
690             if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
691                return GL_FALSE;
692             }
693          }
694          break;
695       case OPCODE_LG2:         /* log base 2 */
696          {
697             GLfloat a[4], result[4], val;
698             fetch_vector1(&inst->SrcReg[0], machine, a);
699 	    /* The fast LOG2 macro doesn't meet the precision requirements.
700 	     */
701             if (a[0] == 0.0F) {
702                val = -FLT_MAX;
703             }
704             else {
705                val = logf(a[0]) * 1.442695F;
706             }
707             result[0] = result[1] = result[2] = result[3] = val;
708             store_vector4(inst, machine, result);
709          }
710          break;
711       case OPCODE_LIT:
712          {
713             const GLfloat epsilon = 1.0F / 256.0F;      /* from NV VP spec */
714             GLfloat a[4], result[4];
715             fetch_vector4(&inst->SrcReg[0], machine, a);
716             a[0] = MAX2(a[0], 0.0F);
717             a[1] = MAX2(a[1], 0.0F);
718             /* XXX ARB version clamps a[3], NV version doesn't */
719             a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
720             result[0] = 1.0F;
721             result[1] = a[0];
722             /* XXX we could probably just use pow() here */
723             if (a[0] > 0.0F) {
724                if (a[1] == 0.0F && a[3] == 0.0F)
725                   result[2] = 1.0F;
726                else
727                   result[2] = powf(a[1], a[3]);
728             }
729             else {
730                result[2] = 0.0F;
731             }
732             result[3] = 1.0F;
733             store_vector4(inst, machine, result);
734             if (DEBUG_PROG) {
735                printf("LIT (%g %g %g %g) : (%g %g %g %g)\n",
736                       result[0], result[1], result[2], result[3],
737                       a[0], a[1], a[2], a[3]);
738             }
739          }
740          break;
741       case OPCODE_LOG:
742          {
743             GLfloat t[4], q[4], abs_t0;
744             fetch_vector1(&inst->SrcReg[0], machine, t);
745             abs_t0 = fabsf(t[0]);
746             if (abs_t0 != 0.0F) {
747                if (util_is_inf_or_nan(abs_t0))
748                {
749                   SET_POS_INFINITY(q[0]);
750                   q[1] = 1.0F;
751                   SET_POS_INFINITY(q[2]);
752                }
753                else {
754                   int exponent;
755                   GLfloat mantissa = frexpf(t[0], &exponent);
756                   q[0] = (GLfloat) (exponent - 1);
757                   q[1] = 2.0F * mantissa; /* map [.5, 1) -> [1, 2) */
758 
759 		  /* The fast LOG2 macro doesn't meet the precision
760 		   * requirements.
761 		   */
762                   q[2] = logf(t[0]) * 1.442695F;
763                }
764             }
765             else {
766                SET_NEG_INFINITY(q[0]);
767                q[1] = 1.0F;
768                SET_NEG_INFINITY(q[2]);
769             }
770             q[3] = 1.0;
771             store_vector4(inst, machine, q);
772          }
773          break;
774       case OPCODE_LRP:
775          {
776             GLfloat a[4], b[4], c[4], result[4];
777             fetch_vector4(&inst->SrcReg[0], machine, a);
778             fetch_vector4(&inst->SrcReg[1], machine, b);
779             fetch_vector4(&inst->SrcReg[2], machine, c);
780             result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
781             result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
782             result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
783             result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
784             store_vector4(inst, machine, result);
785             if (DEBUG_PROG) {
786                printf("LRP (%g %g %g %g) = (%g %g %g %g), "
787                       "(%g %g %g %g), (%g %g %g %g)\n",
788                       result[0], result[1], result[2], result[3],
789                       a[0], a[1], a[2], a[3],
790                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
791             }
792          }
793          break;
794       case OPCODE_MAD:
795          {
796             GLfloat a[4], b[4], c[4], result[4];
797             fetch_vector4(&inst->SrcReg[0], machine, a);
798             fetch_vector4(&inst->SrcReg[1], machine, b);
799             fetch_vector4(&inst->SrcReg[2], machine, c);
800             result[0] = a[0] * b[0] + c[0];
801             result[1] = a[1] * b[1] + c[1];
802             result[2] = a[2] * b[2] + c[2];
803             result[3] = a[3] * b[3] + c[3];
804             store_vector4(inst, machine, result);
805             if (DEBUG_PROG) {
806                printf("MAD (%g %g %g %g) = (%g %g %g %g) * "
807                       "(%g %g %g %g) + (%g %g %g %g)\n",
808                       result[0], result[1], result[2], result[3],
809                       a[0], a[1], a[2], a[3],
810                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
811             }
812          }
813          break;
814       case OPCODE_MAX:
815          {
816             GLfloat a[4], b[4], result[4];
817             fetch_vector4(&inst->SrcReg[0], machine, a);
818             fetch_vector4(&inst->SrcReg[1], machine, b);
819             result[0] = MAX2(a[0], b[0]);
820             result[1] = MAX2(a[1], b[1]);
821             result[2] = MAX2(a[2], b[2]);
822             result[3] = MAX2(a[3], b[3]);
823             store_vector4(inst, machine, result);
824             if (DEBUG_PROG) {
825                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
826                       result[0], result[1], result[2], result[3],
827                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
828             }
829          }
830          break;
831       case OPCODE_MIN:
832          {
833             GLfloat a[4], b[4], result[4];
834             fetch_vector4(&inst->SrcReg[0], machine, a);
835             fetch_vector4(&inst->SrcReg[1], machine, b);
836             result[0] = MIN2(a[0], b[0]);
837             result[1] = MIN2(a[1], b[1]);
838             result[2] = MIN2(a[2], b[2]);
839             result[3] = MIN2(a[3], b[3]);
840             store_vector4(inst, machine, result);
841          }
842          break;
843       case OPCODE_MOV:
844          {
845             GLfloat result[4];
846             fetch_vector4(&inst->SrcReg[0], machine, result);
847             store_vector4(inst, machine, result);
848             if (DEBUG_PROG) {
849                printf("MOV (%g %g %g %g)\n",
850                       result[0], result[1], result[2], result[3]);
851             }
852          }
853          break;
854       case OPCODE_MUL:
855          {
856             GLfloat a[4], b[4], result[4];
857             fetch_vector4(&inst->SrcReg[0], machine, a);
858             fetch_vector4(&inst->SrcReg[1], machine, b);
859             result[0] = a[0] * b[0];
860             result[1] = a[1] * b[1];
861             result[2] = a[2] * b[2];
862             result[3] = a[3] * b[3];
863             store_vector4(inst, machine, result);
864             if (DEBUG_PROG) {
865                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
866                       result[0], result[1], result[2], result[3],
867                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
868             }
869          }
870          break;
871       case OPCODE_NOISE1:
872          {
873             GLfloat a[4], result[4];
874             fetch_vector1(&inst->SrcReg[0], machine, a);
875             result[0] =
876                result[1] =
877                result[2] =
878                result[3] = _mesa_noise1(a[0]);
879             store_vector4(inst, machine, result);
880          }
881          break;
882       case OPCODE_NOISE2:
883          {
884             GLfloat a[4], result[4];
885             fetch_vector4(&inst->SrcReg[0], machine, a);
886             result[0] =
887                result[1] =
888                result[2] = result[3] = _mesa_noise2(a[0], a[1]);
889             store_vector4(inst, machine, result);
890          }
891          break;
892       case OPCODE_NOISE3:
893          {
894             GLfloat a[4], result[4];
895             fetch_vector4(&inst->SrcReg[0], machine, a);
896             result[0] =
897                result[1] =
898                result[2] =
899                result[3] = _mesa_noise3(a[0], a[1], a[2]);
900             store_vector4(inst, machine, result);
901          }
902          break;
903       case OPCODE_NOISE4:
904          {
905             GLfloat a[4], result[4];
906             fetch_vector4(&inst->SrcReg[0], machine, a);
907             result[0] =
908                result[1] =
909                result[2] =
910                result[3] = _mesa_noise4(a[0], a[1], a[2], a[3]);
911             store_vector4(inst, machine, result);
912          }
913          break;
914       case OPCODE_NOP:
915          break;
916       case OPCODE_POW:
917          {
918             GLfloat a[4], b[4], result[4];
919             fetch_vector1(&inst->SrcReg[0], machine, a);
920             fetch_vector1(&inst->SrcReg[1], machine, b);
921             result[0] = result[1] = result[2] = result[3]
922                = powf(a[0], b[0]);
923             store_vector4(inst, machine, result);
924          }
925          break;
926 
927       case OPCODE_RCP:
928          {
929             GLfloat a[4], result[4];
930             fetch_vector1(&inst->SrcReg[0], machine, a);
931             if (DEBUG_PROG) {
932                if (a[0] == 0)
933                   printf("RCP(0)\n");
934                else if (util_is_inf_or_nan(a[0]))
935                   printf("RCP(inf)\n");
936             }
937             result[0] = result[1] = result[2] = result[3] = 1.0F / a[0];
938             store_vector4(inst, machine, result);
939          }
940          break;
941       case OPCODE_RET:         /* return from subroutine (conditional) */
942          if (machine->StackDepth == 0) {
943             return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
944          }
945          /* subtract one because of pc++ in the for loop */
946          pc = machine->CallStack[--machine->StackDepth] - 1;
947          break;
948       case OPCODE_RSQ:         /* 1 / sqrt() */
949          {
950             GLfloat a[4], result[4];
951             fetch_vector1(&inst->SrcReg[0], machine, a);
952             a[0] = fabsf(a[0]);
953             result[0] = result[1] = result[2] = result[3] = 1.0f / sqrtf(a[0]);
954             store_vector4(inst, machine, result);
955             if (DEBUG_PROG) {
956                printf("RSQ %g = 1/sqrt(|%g|)\n", result[0], a[0]);
957             }
958          }
959          break;
960       case OPCODE_SCS:         /* sine and cos */
961          {
962             GLfloat a[4], result[4];
963             fetch_vector1(&inst->SrcReg[0], machine, a);
964             result[0] = cosf(a[0]);
965             result[1] = sinf(a[0]);
966             result[2] = 0.0F;    /* undefined! */
967             result[3] = 0.0F;    /* undefined! */
968             store_vector4(inst, machine, result);
969          }
970          break;
971       case OPCODE_SGE:         /* set on greater or equal */
972          {
973             GLfloat a[4], b[4], result[4];
974             fetch_vector4(&inst->SrcReg[0], machine, a);
975             fetch_vector4(&inst->SrcReg[1], machine, b);
976             result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
977             result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
978             result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
979             result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
980             store_vector4(inst, machine, result);
981             if (DEBUG_PROG) {
982                printf("SGE (%g %g %g %g) = (%g %g %g %g) >= (%g %g %g %g)\n",
983                       result[0], result[1], result[2], result[3],
984                       a[0], a[1], a[2], a[3],
985                       b[0], b[1], b[2], b[3]);
986             }
987          }
988          break;
989       case OPCODE_SIN:
990          {
991             GLfloat a[4], result[4];
992             fetch_vector1(&inst->SrcReg[0], machine, a);
993             result[0] = result[1] = result[2] = result[3]
994                = sinf(a[0]);
995             store_vector4(inst, machine, result);
996          }
997          break;
998       case OPCODE_SLT:         /* set on less */
999          {
1000             GLfloat a[4], b[4], result[4];
1001             fetch_vector4(&inst->SrcReg[0], machine, a);
1002             fetch_vector4(&inst->SrcReg[1], machine, b);
1003             result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1004             result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1005             result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1006             result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1007             store_vector4(inst, machine, result);
1008             if (DEBUG_PROG) {
1009                printf("SLT (%g %g %g %g) = (%g %g %g %g) < (%g %g %g %g)\n",
1010                       result[0], result[1], result[2], result[3],
1011                       a[0], a[1], a[2], a[3],
1012                       b[0], b[1], b[2], b[3]);
1013             }
1014          }
1015          break;
1016       case OPCODE_SSG:         /* set sign (-1, 0 or +1) */
1017          {
1018             GLfloat a[4], result[4];
1019             fetch_vector4(&inst->SrcReg[0], machine, a);
1020             result[0] = (GLfloat) ((a[0] > 0.0F) - (a[0] < 0.0F));
1021             result[1] = (GLfloat) ((a[1] > 0.0F) - (a[1] < 0.0F));
1022             result[2] = (GLfloat) ((a[2] > 0.0F) - (a[2] < 0.0F));
1023             result[3] = (GLfloat) ((a[3] > 0.0F) - (a[3] < 0.0F));
1024             store_vector4(inst, machine, result);
1025          }
1026          break;
1027       case OPCODE_SUB:
1028          {
1029             GLfloat a[4], b[4], result[4];
1030             fetch_vector4(&inst->SrcReg[0], machine, a);
1031             fetch_vector4(&inst->SrcReg[1], machine, b);
1032             result[0] = a[0] - b[0];
1033             result[1] = a[1] - b[1];
1034             result[2] = a[2] - b[2];
1035             result[3] = a[3] - b[3];
1036             store_vector4(inst, machine, result);
1037             if (DEBUG_PROG) {
1038                printf("SUB (%g %g %g %g) = (%g %g %g %g) - (%g %g %g %g)\n",
1039                       result[0], result[1], result[2], result[3],
1040                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1041             }
1042          }
1043          break;
1044       case OPCODE_SWZ:         /* extended swizzle */
1045          {
1046             const struct prog_src_register *source = &inst->SrcReg[0];
1047             const GLfloat *src = get_src_register_pointer(source, machine);
1048             GLfloat result[4];
1049             GLuint i;
1050             for (i = 0; i < 4; i++) {
1051                const GLuint swz = GET_SWZ(source->Swizzle, i);
1052                if (swz == SWIZZLE_ZERO)
1053                   result[i] = 0.0;
1054                else if (swz == SWIZZLE_ONE)
1055                   result[i] = 1.0;
1056                else {
1057                   assert(swz <= 3);
1058                   result[i] = src[swz];
1059                }
1060                if (source->Negate & (1 << i))
1061                   result[i] = -result[i];
1062             }
1063             store_vector4(inst, machine, result);
1064          }
1065          break;
1066       case OPCODE_TEX:         /* Both ARB and NV frag prog */
1067          /* Simple texel lookup */
1068          {
1069             GLfloat texcoord[4], color[4];
1070             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1071 
1072             /* For TEX, texcoord.Q should not be used and its value should not
1073              * matter (at most, we pass coord.xyz to texture3D() in GLSL).
1074              * Set Q=1 so that FetchTexelDeriv() doesn't get a garbage value
1075              * which is effectively what happens when the texcoord swizzle
1076              * is .xyzz
1077              */
1078             texcoord[3] = 1.0f;
1079 
1080             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1081 
1082             if (DEBUG_PROG) {
1083                printf("TEX (%g, %g, %g, %g) = texture[%d][%g, %g, %g, %g]\n",
1084                       color[0], color[1], color[2], color[3],
1085                       inst->TexSrcUnit,
1086                       texcoord[0], texcoord[1], texcoord[2], texcoord[3]);
1087             }
1088             store_vector4(inst, machine, color);
1089          }
1090          break;
1091       case OPCODE_TXB:         /* GL_ARB_fragment_program only */
1092          /* Texel lookup with LOD bias */
1093          {
1094             GLfloat texcoord[4], color[4], lodBias;
1095 
1096             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1097 
1098             /* texcoord[3] is the bias to add to lambda */
1099             lodBias = texcoord[3];
1100 
1101             fetch_texel(ctx, machine, inst, texcoord, lodBias, color);
1102 
1103             if (DEBUG_PROG) {
1104                printf("TXB (%g, %g, %g, %g) = texture[%d][%g %g %g %g]"
1105                       "  bias %g\n",
1106                       color[0], color[1], color[2], color[3],
1107                       inst->TexSrcUnit,
1108                       texcoord[0],
1109                       texcoord[1],
1110                       texcoord[2],
1111                       texcoord[3],
1112                       lodBias);
1113             }
1114 
1115             store_vector4(inst, machine, color);
1116          }
1117          break;
1118       case OPCODE_TXD:
1119          /* Texture lookup w/ partial derivatives for LOD */
1120          {
1121             GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1122             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1123             fetch_vector4(&inst->SrcReg[1], machine, dtdx);
1124             fetch_vector4(&inst->SrcReg[2], machine, dtdy);
1125             machine->FetchTexelDeriv(ctx, texcoord, dtdx, dtdy,
1126                                      0.0, /* lodBias */
1127                                      inst->TexSrcUnit, color);
1128             store_vector4(inst, machine, color);
1129          }
1130          break;
1131       case OPCODE_TXL:
1132          /* Texel lookup with explicit LOD */
1133          {
1134             GLfloat texcoord[4], color[4], lod;
1135 
1136             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1137 
1138             /* texcoord[3] is the LOD */
1139             lod = texcoord[3];
1140 
1141 	    machine->FetchTexelLod(ctx, texcoord, lod,
1142 				   machine->Samplers[inst->TexSrcUnit], color);
1143 
1144             store_vector4(inst, machine, color);
1145          }
1146          break;
1147       case OPCODE_TXP:         /* GL_ARB_fragment_program only */
1148          /* Texture lookup w/ projective divide */
1149          {
1150             GLfloat texcoord[4], color[4];
1151 
1152             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1153             /* Not so sure about this test - if texcoord[3] is
1154              * zero, we'd probably be fine except for an assert in
1155              * IROUND_POS() which gets triggered by the inf values created.
1156              */
1157             if (texcoord[3] != 0.0F) {
1158                texcoord[0] /= texcoord[3];
1159                texcoord[1] /= texcoord[3];
1160                texcoord[2] /= texcoord[3];
1161             }
1162 
1163             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1164 
1165             store_vector4(inst, machine, color);
1166          }
1167          break;
1168       case OPCODE_TRUNC:       /* truncate toward zero */
1169          {
1170             GLfloat a[4], result[4];
1171             fetch_vector4(&inst->SrcReg[0], machine, a);
1172             result[0] = (GLfloat) (GLint) a[0];
1173             result[1] = (GLfloat) (GLint) a[1];
1174             result[2] = (GLfloat) (GLint) a[2];
1175             result[3] = (GLfloat) (GLint) a[3];
1176             store_vector4(inst, machine, result);
1177          }
1178          break;
1179       case OPCODE_XPD:         /* cross product */
1180          {
1181             GLfloat a[4], b[4], result[4];
1182             fetch_vector4(&inst->SrcReg[0], machine, a);
1183             fetch_vector4(&inst->SrcReg[1], machine, b);
1184             result[0] = a[1] * b[2] - a[2] * b[1];
1185             result[1] = a[2] * b[0] - a[0] * b[2];
1186             result[2] = a[0] * b[1] - a[1] * b[0];
1187             result[3] = 1.0;
1188             store_vector4(inst, machine, result);
1189             if (DEBUG_PROG) {
1190                printf("XPD (%g %g %g %g) = (%g %g %g) X (%g %g %g)\n",
1191                       result[0], result[1], result[2], result[3],
1192                       a[0], a[1], a[2], b[0], b[1], b[2]);
1193             }
1194          }
1195          break;
1196       case OPCODE_END:
1197          return GL_TRUE;
1198       default:
1199          _mesa_problem(ctx, "Bad opcode %d in _mesa_execute_program",
1200                        inst->Opcode);
1201          return GL_TRUE;        /* return value doesn't matter */
1202       }
1203 
1204       numExec++;
1205       if (numExec > maxExec) {
1206 	 static GLboolean reported = GL_FALSE;
1207 	 if (!reported) {
1208 	    _mesa_problem(ctx, "Infinite loop detected in fragment program");
1209 	    reported = GL_TRUE;
1210 	 }
1211          return GL_TRUE;
1212       }
1213 
1214    } /* for pc */
1215 
1216    return GL_TRUE;
1217 }
1218