1 /*
2  * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * on the rights to use, copy, modify, merge, publish, distribute, sub
8  * license, and/or sell copies of the Software, and to permit persons to whom
9  * the Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22 
23 #include "radeon_compiler.h"
24 
25 #include <stdbool.h>
26 #include <stdio.h>
27 
28 #include "r300_reg.h"
29 
30 #include "radeon_compiler_util.h"
31 #include "radeon_dataflow.h"
32 #include "radeon_program.h"
33 #include "radeon_program_alu.h"
34 #include "radeon_swizzle.h"
35 #include "radeon_emulate_branches.h"
36 #include "radeon_emulate_loops.h"
37 #include "radeon_remove_constants.h"
38 
39 #include "util/compiler.h"
40 
/*
 * Build a source operand that reads the same selector `y` (e.g. the ZERO or
 * ONE swizzle) on all four channels of the already-valid source register
 * vpi->SrcReg[x].
 *
 * The expansion refers to the locals `vp` and `vpi` of the calling function,
 * so both must be in scope where the macro is used. No channel is negated
 * (RC_MASK_NONE) and the RelAddr flag of SrcReg[x] is carried over in bit 4.
 */
#define __CONST(x, y) \
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \
			 t_swizzle(y), t_swizzle(y), \
			 t_swizzle(y), t_swizzle(y), \
			 t_src_class(vpi->SrcReg[x].File), \
			 RC_MASK_NONE) | \
	 (vpi->SrcReg[x].RelAddr << 4))
53 
54 
t_dst_mask(unsigned int mask)55 static unsigned long t_dst_mask(unsigned int mask)
56 {
57 	/* RC_MASK_* is equivalent to VSF_FLAG_* */
58 	return mask & RC_MASK_XYZW;
59 }
60 
/**
 * Map a compiler register file to the hardware destination register class.
 *
 * Output and address registers have dedicated classes; everything else is
 * treated as a temporary. A file other than output/address/temporary is
 * reported on stderr before falling back to the temporary class.
 */
static unsigned long t_dst_class(rc_register_file file)
{
	if (file == RC_FILE_OUTPUT)
		return PVS_DST_REG_OUT;
	if (file == RC_FILE_ADDRESS)
		return PVS_DST_REG_A0;

	/* Unknown files are diagnosed but still encoded as temporaries. */
	if (file != RC_FILE_TEMPORARY)
		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);

	return PVS_DST_REG_TEMPORARY;
}
75 
t_dst_index(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)76 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
77 				 struct rc_dst_register *dst)
78 {
79 	if (dst->File == RC_FILE_OUTPUT)
80 		return vp->outputs[dst->Index];
81 
82 	return dst->Index;
83 }
84 
/**
 * Map a compiler register file to the hardware source register class.
 *
 * Inputs and constants have dedicated classes. RC_FILE_NONE and
 * RC_FILE_TEMPORARY both read as temporaries; any other file is reported on
 * stderr and then also treated as a temporary.
 */
static unsigned long t_src_class(rc_register_file file)
{
	if (file == RC_FILE_INPUT)
		return PVS_SRC_REG_INPUT;
	if (file == RC_FILE_CONSTANT)
		return PVS_SRC_REG_CONSTANT;

	/* Only NONE and TEMPORARY are expected to reach the fallback. */
	if (file != RC_FILE_NONE && file != RC_FILE_TEMPORARY)
		fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);

	return PVS_SRC_REG_TEMPORARY;
}
100 
t_src_conflict(struct rc_src_register a,struct rc_src_register b)101 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
102 {
103 	unsigned long aclass = t_src_class(a.File);
104 	unsigned long bclass = t_src_class(b.File);
105 
106 	if (aclass != bclass)
107 		return 0;
108 	if (aclass == PVS_SRC_REG_TEMPORARY)
109 		return 0;
110 
111 	if (a.RelAddr || b.RelAddr)
112 		return 1;
113 	if (a.Index != b.Index)
114 		return 1;
115 
116 	return 0;
117 }
118 
/**
 * Convert a compiler swizzle selector into the hardware selector.
 *
 * The Mesa RC_SWIZZLE_* values are identical to the VSF_IN_COMPONENT_*
 * values, so this only widens the value.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	const unsigned long hw_select = swizzle;

	return hw_select;
}
124 
t_src_index(struct r300_vertex_program_code * vp,struct rc_src_register * src)125 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
126 				 struct rc_src_register *src)
127 {
128 	if (src->File == RC_FILE_INPUT) {
129 		assert(vp->inputs[src->Index] != -1);
130 		return vp->inputs[src->Index];
131 	} else {
132 		if (src->Index < 0) {
133 			fprintf(stderr,
134 				"negative offsets for indirect addressing do not work.\n");
135 			return 0;
136 		}
137 		return src->Index;
138 	}
139 }
140 
141 /* these two functions should probably be merged... */
142 
t_src(struct r300_vertex_program_code * vp,struct rc_src_register * src)143 static unsigned long t_src(struct r300_vertex_program_code *vp,
144 			   struct rc_src_register *src)
145 {
146 	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
147 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
148 	 */
149 	return PVS_SRC_OPERAND(t_src_index(vp, src),
150 			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
151 			       t_swizzle(GET_SWZ(src->Swizzle, 1)),
152 			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
153 			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
154 			       t_src_class(src->File),
155 			       src->Negate) |
156 	       (src->RelAddr << 4) | (src->Abs << 3);
157 }
158 
t_src_scalar(struct r300_vertex_program_code * vp,struct rc_src_register * src)159 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
160 				  struct rc_src_register *src)
161 {
162 	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
163 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
164 	 */
165 	unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
166 
167 	return PVS_SRC_OPERAND(t_src_index(vp, src),
168 			       t_swizzle(swz),
169 			       t_swizzle(swz),
170 			       t_swizzle(swz),
171 			       t_swizzle(swz),
172 			       t_src_class(src->File),
173 			       src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
174 	       (src->RelAddr << 4) | (src->Abs << 3);
175 }
176 
valid_dst(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)177 static int valid_dst(struct r300_vertex_program_code *vp,
178 			   struct rc_dst_register *dst)
179 {
180 	if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
181 		return 0;
182 	} else if (dst->File == RC_FILE_ADDRESS) {
183 		assert(dst->Index == 0);
184 	}
185 
186 	return 1;
187 }
188 
ei_vector1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)189 static void ei_vector1(struct r300_vertex_program_code *vp,
190 				unsigned int hw_opcode,
191 				struct rc_sub_instruction *vpi,
192 				unsigned int * inst)
193 {
194 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
195 				     0,
196 				     0,
197 				     t_dst_index(vp, &vpi->DstReg),
198 				     t_dst_mask(vpi->DstReg.WriteMask),
199 				     t_dst_class(vpi->DstReg.File),
200                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
201 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
202 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
203 	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
204 }
205 
ei_vector2(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)206 static void ei_vector2(struct r300_vertex_program_code *vp,
207 				unsigned int hw_opcode,
208 				struct rc_sub_instruction *vpi,
209 				unsigned int * inst)
210 {
211 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
212 				     0,
213 				     0,
214 				     t_dst_index(vp, &vpi->DstReg),
215 				     t_dst_mask(vpi->DstReg.WriteMask),
216 				     t_dst_class(vpi->DstReg.File),
217                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
218 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
219 	inst[2] = t_src(vp, &vpi->SrcReg[1]);
220 	inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
221 }
222 
ei_math1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)223 static void ei_math1(struct r300_vertex_program_code *vp,
224 				unsigned int hw_opcode,
225 				struct rc_sub_instruction *vpi,
226 				unsigned int * inst)
227 {
228 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
229 				     1,
230 				     0,
231 				     t_dst_index(vp, &vpi->DstReg),
232 				     t_dst_mask(vpi->DstReg.WriteMask),
233 				     t_dst_class(vpi->DstReg.File),
234                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
235 	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
236 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
237 	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
238 }
239 
ei_lit(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)240 static void ei_lit(struct r300_vertex_program_code *vp,
241 				      struct rc_sub_instruction *vpi,
242 				      unsigned int * inst)
243 {
244 	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
245 
246 	inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
247 				     1,
248 				     0,
249 				     t_dst_index(vp, &vpi->DstReg),
250 				     t_dst_mask(vpi->DstReg.WriteMask),
251 				     t_dst_class(vpi->DstReg.File),
252                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
253 	/* NOTE: Users swizzling might not work. */
254 	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
255 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
256 				  PVS_SRC_SELECT_FORCE_0,	// Z
257 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
258 				  t_src_class(vpi->SrcReg[0].File),
259 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
260 	    (vpi->SrcReg[0].RelAddr << 4);
261 	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
262 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
263 				  PVS_SRC_SELECT_FORCE_0,	// Z
264 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
265 				  t_src_class(vpi->SrcReg[0].File),
266 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
267 	    (vpi->SrcReg[0].RelAddr << 4);
268 	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
269 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
270 				  PVS_SRC_SELECT_FORCE_0,	// Z
271 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
272 				  t_src_class(vpi->SrcReg[0].File),
273 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
274 	    (vpi->SrcReg[0].RelAddr << 4);
275 }
276 
ei_mad(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)277 static void ei_mad(struct r300_vertex_program_code *vp,
278 				      struct rc_sub_instruction *vpi,
279 				      unsigned int * inst)
280 {
281 	unsigned int i;
282 	/* Remarks about hardware limitations of MAD
283 	 * (please preserve this comment, as this information is _NOT_
284 	 * in the documentation provided by AMD).
285 	 *
286 	 * As described in the documentation, MAD with three unique temporary
287 	 * source registers requires the use of the macro version.
288 	 *
289 	 * However (and this is not mentioned in the documentation), apparently
290 	 * the macro version is _NOT_ a full superset of the normal version.
291 	 * In particular, the macro version does not always work when relative
292 	 * addressing is used in the source operands.
293 	 *
294 	 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
295 	 * assembly shader path when using medium quality animations
296 	 * (i.e. animations with matrix blending instead of quaternion blending).
297 	 *
298 	 * Unfortunately, I (nha) have been unable to extract a Piglit regression
299 	 * test for this issue - for some reason, it is possible to have vertex
300 	 * programs whose prefix is *exactly* the same as the prefix of the
301 	 * offending program in Sauerbraten up to the offending instruction
302 	 * without causing any trouble.
303 	 *
304 	 * Bottom line: Only use the macro version only when really necessary;
305 	 * according to AMD docs, this should improve performance by one clock
306 	 * as a nice side bonus.
307 	 */
308 	if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
309 	    vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
310 	    vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
311 	    vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
312 	    vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
313 	    vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
314 		inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
315 				0,
316 				1,
317 				t_dst_index(vp, &vpi->DstReg),
318 				t_dst_mask(vpi->DstReg.WriteMask),
319 				t_dst_class(vpi->DstReg.File),
320                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
321 	} else {
322 		inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
323 				0,
324 				0,
325 				t_dst_index(vp, &vpi->DstReg),
326 				t_dst_mask(vpi->DstReg.WriteMask),
327 				t_dst_class(vpi->DstReg.File),
328                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
329 
330 		/* Arguments with constant swizzles still count as a unique
331 		 * temporary, so we should make sure these arguments share a
332 		 * register index with one of the other arguments. */
333 		for (i = 0; i < 3; i++) {
334 			unsigned int j;
335 			if (vpi->SrcReg[i].File != RC_FILE_NONE)
336 				continue;
337 
338 			for (j = 0; j < 3; j++) {
339 				if (i != j) {
340 					vpi->SrcReg[i].Index =
341 						vpi->SrcReg[j].Index;
342 					break;
343 				}
344 			}
345 		}
346 	}
347 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
348 	inst[2] = t_src(vp, &vpi->SrcReg[1]);
349 	inst[3] = t_src(vp, &vpi->SrcReg[2]);
350 }
351 
ei_pow(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)352 static void ei_pow(struct r300_vertex_program_code *vp,
353 				      struct rc_sub_instruction *vpi,
354 				      unsigned int * inst)
355 {
356 	inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
357 				     1,
358 				     0,
359 				     t_dst_index(vp, &vpi->DstReg),
360 				     t_dst_mask(vpi->DstReg.WriteMask),
361 				     t_dst_class(vpi->DstReg.File),
362                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
363 	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
364 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
365 	inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
366 }
367 
translate_vertex_program(struct radeon_compiler * c,void * user)368 static void translate_vertex_program(struct radeon_compiler *c, void *user)
369 {
370 	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
371 	struct rc_instruction *rci;
372 
373 	unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {};
374 	unsigned loop_depth = 0;
375 
376 	compiler->code->pos_end = 0;	/* Not supported yet */
377 	compiler->code->length = 0;
378 	compiler->code->num_temporaries = 0;
379 
380 	compiler->SetHwInputOutput(compiler);
381 
382 	for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
383 		struct rc_sub_instruction *vpi = &rci->U.I;
384 		unsigned int *inst = compiler->code->body.d + compiler->code->length;
385 		const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
386 
387 		/* Skip instructions writing to non-existing destination */
388 		if (!valid_dst(compiler->code, &vpi->DstReg))
389 			continue;
390 
391 		if (info->HasDstReg) {
392 			/* Neither is Saturate. */
393 			if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
394 				rc_error(&compiler->Base, "Vertex program does not support the Saturate "
395 					 "modifier (yet).\n");
396 			}
397 		}
398 
399 		if (compiler->code->length >= c->max_alu_insts * 4) {
400 			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
401 			return;
402 		}
403 
404 		assert(compiler->Base.is_r500 ||
405 		       (vpi->Opcode != RC_OPCODE_SEQ &&
406 			vpi->Opcode != RC_OPCODE_SNE));
407 
408 		switch (vpi->Opcode) {
409 		case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
410 		case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
411 		case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
412 		case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
413 		case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
414 		case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
415 		case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
416 		case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
417 		case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
418 		case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
419 		case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
420 		case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
421 		case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
422 		case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
423 		case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
424 		case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
425 		case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
426 		case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
427 		case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
428 		case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
429 		case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
430 		case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
431 		case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
432 		case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
433 		case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
434 		case RC_OPCODE_BGNLOOP:
435 		{
436 			if ((!compiler->Base.is_r500
437 				&& loop_depth >= R300_VS_MAX_LOOP_DEPTH)
438 				|| loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
439 				rc_error(&compiler->Base,
440 						"Loops are nested too deep.");
441 				return;
442 			}
443 			loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;
444 			break;
445 		}
446 		case RC_OPCODE_ENDLOOP:
447 		{
448 			unsigned int act_addr;
449 			unsigned int last_addr;
450 			unsigned int ret_addr;
451 
452 			ret_addr = loops[--loop_depth];
453 			act_addr = ret_addr - 1;
454 			last_addr = (compiler->code->length / 4) - 1;
455 
456 			if (loop_depth >= R300_VS_MAX_FC_OPS) {
457 				rc_error(&compiler->Base,
458 					"Too many flow control instructions.");
459 				return;
460 			}
461 			if (compiler->Base.is_r500) {
462 				compiler->code->fc_op_addrs.r500
463 					[compiler->code->num_fc_ops].lw =
464 					R500_PVS_FC_ACT_ADRS(act_addr)
465 					| R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)
466 					;
467 				compiler->code->fc_op_addrs.r500
468 					[compiler->code->num_fc_ops].uw =
469 					R500_PVS_FC_LAST_INST(last_addr)
470 					| R500_PVS_FC_RTN_INST(ret_addr)
471 					;
472 			} else {
473 				compiler->code->fc_op_addrs.r300
474 					[compiler->code->num_fc_ops] =
475 					R300_PVS_FC_ACT_ADRS(act_addr)
476 					| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
477 					| R300_PVS_FC_LAST_INST(last_addr)
478 					| R300_PVS_FC_RTN_INST(ret_addr)
479 					;
480 			}
481 			compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
482 				R300_PVS_FC_LOOP_INIT_VAL(0x0)
483 				| R300_PVS_FC_LOOP_STEP_VAL(0x1)
484 				;
485 			compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
486 						compiler->code->num_fc_ops);
487 			compiler->code->num_fc_ops++;
488 
489 			break;
490 		}
491 
492 		case RC_ME_PRED_SET_CLR:
493 			ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
494 			break;
495 
496 		case RC_ME_PRED_SET_INV:
497 			ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
498 			break;
499 
500 		case RC_ME_PRED_SET_POP:
501 			ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
502 			break;
503 
504 		case RC_ME_PRED_SET_RESTORE:
505 			ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
506 			break;
507 
508 		case RC_ME_PRED_SEQ:
509 			ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
510 			break;
511 
512 		case RC_ME_PRED_SNEQ:
513 			ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
514 			break;
515 
516 		case RC_VE_PRED_SNEQ_PUSH:
517 			ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
518 								vpi, inst);
519 			break;
520 
521 		default:
522 			rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
523 			return;
524 		}
525 
526 		if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
527 			inst[0] |= (PVS_DST_PRED_ENABLE_MASK
528 						<< PVS_DST_PRED_ENABLE_SHIFT);
529 			if (vpi->DstReg.Pred == RC_PRED_SET) {
530 				inst[0] |= (PVS_DST_PRED_SENSE_MASK
531 						<< PVS_DST_PRED_SENSE_SHIFT);
532 			}
533 		}
534 
535 		/* Update the number of temporaries. */
536 		if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
537 		    vpi->DstReg.Index >= compiler->code->num_temporaries)
538 			compiler->code->num_temporaries = vpi->DstReg.Index + 1;
539 
540 		for (unsigned i = 0; i < info->NumSrcRegs; i++)
541 			if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
542 			    vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
543 				compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
544 
545 		if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
546 			rc_error(&compiler->Base, "Too many temporaries.\n");
547 			return;
548 		}
549 
550 		compiler->code->length += 4;
551 
552 		if (compiler->Base.Error)
553 			return;
554 	}
555 }
556 
/**
 * Allocation state of one original (pre-allocation) temporary register.
 */
struct temporary_allocation {
	/* Set once a hardware temporary has been assigned. */
	unsigned int Allocated:1;
	/* Index of the assigned hardware temporary. */
	unsigned int HwTemp:15;
	/* Instruction after which the register is no longer read. */
	struct rc_instruction *LastRead;
};
562 
get_reg(struct radeon_compiler * c,struct temporary_allocation * ta,bool * hwtemps,unsigned int orig)563 static int get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, bool *hwtemps,
564                    unsigned int orig)
565 {
566     if (!ta[orig].Allocated) {
567         int j;
568         for (j = 0; j < c->max_temp_regs; ++j)
569         {
570             if (!hwtemps[j])
571                 break;
572         }
573         ta[orig].Allocated = 1;
574         ta[orig].HwTemp = j;
575         hwtemps[ta[orig].HwTemp] = true;
576     }
577 
578     return ta[orig].HwTemp;
579 }
580 
allocate_temporary_registers(struct radeon_compiler * c,void * user)581 static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
582 {
583 	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
584 	struct rc_instruction *inst;
585 	struct rc_instruction *end_loop = NULL;
586 	unsigned int num_orig_temps = 0;
587 	bool hwtemps[RC_REGISTER_MAX_INDEX];
588 	struct temporary_allocation * ta;
589 	unsigned int i;
590 
591 	memset(hwtemps, 0, sizeof(hwtemps));
592 
593 	rc_recompute_ips(c);
594 
595 	/* Pass 1: Count original temporaries. */
596 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
597 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
598 
599 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
600 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
601 				if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
602 					num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
603 			}
604 		}
605 
606 		if (opcode->HasDstReg) {
607 			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
608 				if (inst->U.I.DstReg.Index >= num_orig_temps)
609 					num_orig_temps = inst->U.I.DstReg.Index + 1;
610 			}
611 		}
612 	}
613 
614 	ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
615 			sizeof(struct temporary_allocation) * num_orig_temps);
616 	memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
617 
618 	/* Pass 2: Determine original temporary lifetimes */
619 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
620 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
621 		/* Instructions inside of loops need to use the ENDLOOP
622 		 * instruction as their LastRead. */
623 		if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
624 			int endloops = 1;
625 			struct rc_instruction * ptr;
626 			for(ptr = inst->Next;
627 				ptr != &compiler->Base.Program.Instructions;
628 							ptr = ptr->Next){
629 				if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
630 					endloops++;
631 				} else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
632 					endloops--;
633 					if (endloops <= 0) {
634 						end_loop = ptr;
635 						break;
636 					}
637 				}
638 			}
639 		}
640 
641 		if (inst == end_loop) {
642 			end_loop = NULL;
643 			continue;
644 		}
645 
646 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
647 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
648 				ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;
649 			}
650 		}
651 	}
652 
653 	/* Pass 3: Register allocation */
654 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
655 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
656 
657 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
658 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
659 				unsigned int orig = inst->U.I.SrcReg[i].Index;
660 				inst->U.I.SrcReg[i].Index = get_reg(c, ta, hwtemps, orig);
661 
662 				if (ta[orig].Allocated && inst == ta[orig].LastRead)
663 					hwtemps[ta[orig].HwTemp] = false;
664 			}
665 		}
666 
667 		if (opcode->HasDstReg) {
668 			if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
669 				unsigned int orig = inst->U.I.DstReg.Index;
670 				inst->U.I.DstReg.Index = get_reg(c, ta, hwtemps, orig);
671 			}
672 		}
673 	}
674 }
675 
676 /**
677  * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
678  * and the Saturate opcode modifier. Only Absolute is currently transformed.
679  */
transform_nonnative_modifiers(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)680 static int transform_nonnative_modifiers(
681 	struct radeon_compiler *c,
682 	struct rc_instruction *inst,
683 	void* unused)
684 {
685 	const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
686 	unsigned i;
687 
688 	/* Transform ABS(a) to MAX(a, -a). */
689 	for (i = 0; i < opcode->NumSrcRegs; i++) {
690 		if (inst->U.I.SrcReg[i].Abs) {
691 			struct rc_instruction *new_inst;
692 			unsigned temp;
693 
694 			inst->U.I.SrcReg[i].Abs = 0;
695 
696 			temp = rc_find_free_temporary(c);
697 
698 			new_inst = rc_insert_new_instruction(c, inst->Prev);
699 			new_inst->U.I.Opcode = RC_OPCODE_MAX;
700 			new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
701 			new_inst->U.I.DstReg.Index = temp;
702 			new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
703 			new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
704 			new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
705 
706 			memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
707 			inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
708 			inst->U.I.SrcReg[i].Index = temp;
709 			inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
710 		}
711 	}
712 	return 1;
713 }
714 
715 /**
716  * Vertex engine cannot read two inputs or two constants at the same time.
717  * Introduce intermediate MOVs to temporary registers to account for this.
718  */
transform_source_conflicts(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)719 static int transform_source_conflicts(
720 	struct radeon_compiler *c,
721 	struct rc_instruction* inst,
722 	void* unused)
723 {
724 	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
725 
726 	if (opcode->NumSrcRegs == 3) {
727 		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
728 		    || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
729 			int tmpreg = rc_find_free_temporary(c);
730 			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
731 			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
732 			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
733 			inst_mov->U.I.DstReg.Index = tmpreg;
734 			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
735 			inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
736 			inst_mov->U.I.SrcReg[0].Negate = 0;
737 			inst_mov->U.I.SrcReg[0].Abs = 0;
738 
739 			inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
740 			inst->U.I.SrcReg[2].Index = tmpreg;
741 			inst->U.I.SrcReg[2].RelAddr = false;
742 		}
743 	}
744 
745 	if (opcode->NumSrcRegs >= 2) {
746 		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
747 			int tmpreg = rc_find_free_temporary(c);
748 			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
749 			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
750 			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
751 			inst_mov->U.I.DstReg.Index = tmpreg;
752 			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
753 			inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
754 			inst_mov->U.I.SrcReg[0].Negate = 0;
755 			inst_mov->U.I.SrcReg[0].Abs = 0;
756 
757 			inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
758 			inst->U.I.SrcReg[1].Index = tmpreg;
759 			inst->U.I.SrcReg[1].RelAddr = false;
760 		}
761 	}
762 
763 	return 1;
764 }
765 
rc_vs_add_artificial_outputs(struct radeon_compiler * c,void * user)766 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
767 {
768 	struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
769 	int i;
770 
771 	for(i = 0; i < 32; ++i) {
772 		if ((compiler->RequiredOutputs & (1U << i)) &&
773 		    !(compiler->Base.Program.OutputsWritten & (1U << i))) {
774 			struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
775 			inst->U.I.Opcode = RC_OPCODE_MOV;
776 
777 			inst->U.I.DstReg.File = RC_FILE_OUTPUT;
778 			inst->U.I.DstReg.Index = i;
779 			inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
780 
781 			inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
782 			inst->U.I.SrcReg[0].Index = 0;
783 			inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
784 
785 			compiler->Base.Program.OutputsWritten |= 1U << i;
786 		}
787 	}
788 }
789 
dataflow_outputs_mark_used(void * userdata,void * data,void (* callback)(void *,unsigned int,unsigned int))790 static void dataflow_outputs_mark_used(void * userdata, void * data,
791 		void (*callback)(void *, unsigned int, unsigned int))
792 {
793 	struct r300_vertex_program_compiler * c = userdata;
794 	int i;
795 
796 	for(i = 0; i < 32; ++i) {
797 		if (c->RequiredOutputs & (1U << i))
798 			callback(data, i, RC_MASK_XYZW);
799 	}
800 }
801 
/**
 * Swizzle capability query: every swizzle is reported as native.
 *
 * Both arguments are ignored.
 *
 * \return always 1
 */
static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
{
	const int is_native = 1;

	(void) reg;
	(void) opcode;

	return is_native;
}
809 
transform_negative_addressing(struct r300_vertex_program_compiler * c,struct rc_instruction * arl,struct rc_instruction * end,int min_offset)810 static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
811 					  struct rc_instruction *arl,
812 					  struct rc_instruction *end,
813 					  int min_offset)
814 {
815 	struct rc_instruction *inst, *add;
816 	unsigned const_swizzle;
817 
818 	/* Transform ARL/ARR */
819 	add = rc_insert_new_instruction(&c->Base, arl->Prev);
820 	add->U.I.Opcode = RC_OPCODE_ADD;
821 	add->U.I.DstReg.File = RC_FILE_TEMPORARY;
822 	add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
823 	add->U.I.DstReg.WriteMask = RC_MASK_X;
824 	add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
825 	add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
826 	add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
827 								     min_offset, &const_swizzle);
828 	add->U.I.SrcReg[1].Swizzle = const_swizzle;
829 
830 	arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
831 	arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
832 	arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
833 
834 	/* Rewrite offsets up to and excluding inst. */
835 	for (inst = arl->Next; inst != end; inst = inst->Next) {
836 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
837 
838 		for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
839 			if (inst->U.I.SrcReg[i].RelAddr)
840 				inst->U.I.SrcReg[i].Index -= min_offset;
841 	}
842 }
843 
rc_emulate_negative_addressing(struct radeon_compiler * compiler,void * user)844 static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
845 {
846 	struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
847 	struct rc_instruction *inst, *lastARL = NULL;
848 	int min_offset = 0;
849 
850 	for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
851 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
852 
853 		if (inst->U.I.Opcode == RC_OPCODE_ARL || inst->U.I.Opcode == RC_OPCODE_ARR) {
854 			if (lastARL != NULL && min_offset < 0)
855 				transform_negative_addressing(c, lastARL, inst, min_offset);
856 
857 			lastARL = inst;
858 			min_offset = 0;
859 			continue;
860 		}
861 
862 		for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
863 			if (inst->U.I.SrcReg[i].RelAddr &&
864 			    inst->U.I.SrcReg[i].Index < 0) {
865 				/* ARL must precede any indirect addressing. */
866 				if (!lastARL) {
867 					rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL/ARR.");
868 					return;
869 				}
870 
871 				if (inst->U.I.SrcReg[i].Index < min_offset)
872 					min_offset = inst->U.I.SrcReg[i].Index;
873 			}
874 		}
875 	}
876 
877 	if (lastARL != NULL && min_offset < 0)
878 		transform_negative_addressing(c, lastARL, inst, min_offset);
879 }
880 
881 const struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
882 	.IsNative = &swizzle_is_native,
883 	.Split = 0 /* should never be called */
884 };
885 
r3xx_compile_vertex_program(struct r300_vertex_program_compiler * c)886 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
887 {
888 	int is_r500 = c->Base.is_r500;
889 	int opt = !c->Base.disable_optimizations;
890 
891 	/* Lists of instruction transformations. */
892 	struct radeon_program_transformation alu_rewrite_r500[] = {
893 		{ &r300_transform_vertex_alu, 0 },
894 		{ &r300_transform_trig_scale_vertex, 0 },
895 		{ 0, 0 }
896 	};
897 
898 	struct radeon_program_transformation alu_rewrite_r300[] = {
899 		{ &r300_transform_vertex_alu, 0 },
900 		{ &r300_transform_trig_simple, 0 },
901 		{ 0, 0 }
902 	};
903 
904 	/* Note: These passes have to be done seperately from ALU rewrite,
905 	 * otherwise non-native ALU instructions with source conflits
906 	 * or non-native modifiers will not be treated properly.
907 	 */
908 	struct radeon_program_transformation emulate_modifiers[] = {
909 		{ &transform_nonnative_modifiers, 0 },
910 		{ 0, 0 }
911 	};
912 
913 	struct radeon_program_transformation resolve_src_conflicts[] = {
914 		{ &transform_source_conflicts, 0 },
915 		{ 0, 0 }
916 	};
917 
918 	/* List of compiler passes. */
919 	struct radeon_compiler_pass vs_list[] = {
920 		/* NAME				DUMP PREDICATE	FUNCTION			PARAM */
921 		{"add artificial outputs",	0, 1,		rc_vs_add_artificial_outputs,	NULL},
922 		{"emulate branches",		1, !is_r500,	rc_emulate_branches,		NULL},
923 		{"emulate negative addressing", 1, 1,		rc_emulate_negative_addressing,	NULL},
924 		{"native rewrite",		1, is_r500,	rc_local_transform,		alu_rewrite_r500},
925 		{"native rewrite",		1, !is_r500,	rc_local_transform,		alu_rewrite_r300},
926 		{"emulate modifiers",		1, !is_r500,	rc_local_transform,		emulate_modifiers},
927 		{"deadcode",			1, opt,		rc_dataflow_deadcode,		dataflow_outputs_mark_used},
928 		{"dataflow optimize",		1, opt,		rc_optimize,			NULL},
929 		/* This pass must be done after optimizations. */
930 		{"source conflict resolve",	1, 1,		rc_local_transform,		resolve_src_conflicts},
931 		{"register allocation",		1, opt,		allocate_temporary_registers,	NULL},
932 		{"dead constants",		1, 1,		rc_remove_unused_constants,	&c->code->constants_remap_table},
933 		{"lower control flow opcodes",	1, is_r500,	rc_vert_fc,			NULL},
934 		{"final code validation",	0, 1,		rc_validate_final_shader,	NULL},
935 		{"machine code generation",	0, 1,		translate_vertex_program,	NULL},
936 		{"dump machine code",		0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,	NULL},
937 		{NULL, 0, 0, NULL, NULL}
938 	};
939 
940 	c->Base.type = RC_VERTEX_PROGRAM;
941 	c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
942 
943 	rc_run_compiler(&c->Base, vs_list);
944 
945 	c->code->InputsRead = c->Base.Program.InputsRead;
946 	c->code->OutputsWritten = c->Base.Program.OutputsWritten;
947 	rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
948 }
949