1 /*
2  * Copyright (C) 1995-2008 University of Karlsruhe.  All rights reserved.
3  *
4  * This file is part of libFirm.
5  *
6  * This file may be distributed and/or modified under the terms of the
7  * GNU General Public License version 2 as published by the Free Software
8  * Foundation and appearing in the file LICENSE.GPL included in the
9  * packaging of this file.
10  *
11  * Licensees holding valid libFirm Professional Edition licenses may use
12  * this file in accordance with the libFirm Commercial License.
13  * Agreement provided with the Software.
14  *
15  * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
16  * WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE.
18  */
19 
20 /**
21  * @file
22  * @brief       Implements several optimizations for IA32.
23  * @author      Matthias Braun, Christian Wuerdig
24  */
25 #include "config.h"
26 
27 #include "irnode.h"
28 #include "irprog_t.h"
29 #include "ircons.h"
30 #include "irtools.h"
31 #include "firm_types.h"
32 #include "iredges.h"
33 #include "tv.h"
34 #include "irgmod.h"
35 #include "irgwalk.h"
36 #include "heights.h"
37 #include "irprintf.h"
38 #include "irdump.h"
39 #include "error.h"
40 
41 #include "be_t.h"
42 #include "beabi.h"
43 #include "benode.h"
44 #include "besched.h"
45 #include "bepeephole.h"
46 
47 #include "ia32_new_nodes.h"
48 #include "ia32_optimize.h"
49 #include "bearch_ia32_t.h"
50 #include "gen_ia32_regalloc_if.h"
51 #include "ia32_common_transform.h"
52 #include "ia32_transform.h"
53 #include "ia32_dbg_stat.h"
54 #include "ia32_architecture.h"
55 
56 DEBUG_ONLY(static firm_dbg_module_t *dbg = NULL;)
57 
58 static void copy_mark(const ir_node *old, ir_node *newn)
59 {
60 	if (is_ia32_is_reload(old))
61 		set_ia32_is_reload(newn);
62 	if (is_ia32_is_spill(old))
63 		set_ia32_is_spill(newn);
64 	if (is_ia32_is_remat(old))
65 		set_ia32_is_remat(newn);
66 }
67 
68 typedef enum produces_flag_t {
69 	produces_no_flag,
70 	produces_zero_sign,
71 	produces_zero_in_carry
72 } produces_flag_t;
73 
74 /**
75  * Return which usable flag the given node produces about the result.
76  * That is zero (ZF) and sign (SF).
77  * We do not check for carry (CF) or overflow (OF).
78  *
79  * @param node  the node to check
80  * @param pn    the projection number of the used result
81  */
82 static produces_flag_t check_produces_zero_sign(ir_node *node, int pn)
83 {
84 	ir_node                     *count;
85 	const ia32_immediate_attr_t *imm_attr;
86 
87 	if (!is_ia32_irn(node))
88 		return produces_no_flag;
89 
90 	switch (get_ia32_irn_opcode(node)) {
91 		case iro_ia32_Add:
92 		case iro_ia32_Adc:
93 		case iro_ia32_And:
94 		case iro_ia32_Or:
95 		case iro_ia32_Xor:
96 		case iro_ia32_Sub:
97 		case iro_ia32_Sbb:
98 		case iro_ia32_Neg:
99 		case iro_ia32_Inc:
100 		case iro_ia32_Dec:
101 			break;
102 
103 		case iro_ia32_ShlD:
104 		case iro_ia32_ShrD:
105 			assert((int)n_ia32_ShlD_count == (int)n_ia32_ShrD_count);
106 			count = get_irn_n(node, n_ia32_ShlD_count);
107 			goto check_shift_amount;
108 
109 		case iro_ia32_Shl:
110 		case iro_ia32_Shr:
111 		case iro_ia32_Sar:
112 			assert((int)n_ia32_Shl_count == (int)n_ia32_Shr_count
113 					&& (int)n_ia32_Shl_count == (int)n_ia32_Sar_count);
114 			count = get_irn_n(node, n_ia32_Shl_count);
115 check_shift_amount:
116 			/* when the shift count is zero the flags are not affected, so we can
117 			 * only do this for constant counts != 0 */
118 			if (!is_ia32_Immediate(count))
119 				return produces_no_flag;
120 
121 			imm_attr = get_ia32_immediate_attr_const(count);
122 			if (imm_attr->symconst != NULL)
123 				return produces_no_flag;
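			/* the hardware masks shift counts to the low 5 bits, so a count that
			 * is a multiple of 32 acts like a shift by 0 and leaves the flags
			 * untouched */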
124 			if ((imm_attr->offset & 0x1f) == 0)
125 				return produces_no_flag;
126 			break;
127 
128 		case iro_ia32_Mul:
129 			return pn == pn_ia32_Mul_res_high ?
130 				produces_zero_in_carry : produces_no_flag;
131 
132 		default:
133 			return produces_no_flag;
134 	}
135 
136 	return pn == pn_ia32_res ? produces_zero_sign : produces_no_flag;
137 }
138 
139 /**
140  * Replace Cmp(x, 0) by a Test(x, x)
141  */
142 static void peephole_ia32_Cmp(ir_node *const node)
143 {
144 	ir_node                     *right;
145 	ir_graph                    *irg;
146 	ia32_immediate_attr_t const *imm;
147 	dbg_info                    *dbgi;
148 	ir_node                     *block;
149 	ir_node                     *noreg;
150 	ir_node                     *nomem;
151 	ir_node                     *op;
152 	ia32_attr_t           const *attr;
153 	int                          ins_permuted;
154 	ir_node                     *test;
155 	arch_register_t       const *reg;
156 
157 	if (get_ia32_op_type(node) != ia32_Normal)
158 		return;
159 
160 	right = get_irn_n(node, n_ia32_Cmp_right);
161 	if (!is_ia32_Immediate(right))
162 		return;
163 
164 	imm = get_ia32_immediate_attr_const(right);
165 	if (imm->symconst != NULL || imm->offset != 0)
166 		return;
167 
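	/* Cmp x, 0 and Test x, x set ZF and SF identically, but the Test needs no
	 * immediate and therefore encodes shorter. */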
168 	dbgi         = get_irn_dbg_info(node);
169 	irg          = get_irn_irg(node);
170 	block        = get_nodes_block(node);
171 	noreg        = ia32_new_NoReg_gp(irg);
172 	nomem        = get_irg_no_mem(current_ir_graph);
173 	op           = get_irn_n(node, n_ia32_Cmp_left);
174 	attr         = get_ia32_attr(node);
175 	ins_permuted = attr->data.ins_permuted;
176 
177 	if (is_ia32_Cmp(node)) {
178 		test = new_bd_ia32_Test(dbgi, block, noreg, noreg, nomem,
179 		                        op, op, ins_permuted);
180 	} else {
181 		test = new_bd_ia32_Test8Bit(dbgi, block, noreg, noreg, nomem,
182 		                            op, op, ins_permuted);
183 	}
184 	set_ia32_ls_mode(test, get_ia32_ls_mode(node));
185 
186 	reg = arch_get_irn_register_out(node, pn_ia32_Cmp_eflags);
187 	arch_set_irn_register_out(test, pn_ia32_Test_eflags, reg);
188 
189 	foreach_out_edge_safe(node, edge) {
190 		ir_node *const user = get_edge_src_irn(edge);
191 
192 		if (is_Proj(user))
193 			exchange(user, test);
194 	}
195 
196 	sched_add_before(node, test);
197 	copy_mark(node, test);
198 	be_peephole_exchange(node, test);
199 }
200 
201 /**
202  * Peephole optimization for Test instructions.
203  * - Remove the Test if an appropriate flag was already produced and is still live
204  * - Change a Test(x, c) to an 8 bit Test if 0 <= c < 256 (3 byte shorter opcode)
205  */
206 static void peephole_ia32_Test(ir_node *node)
207 {
208 	ir_node *left  = get_irn_n(node, n_ia32_Test_left);
209 	ir_node *right = get_irn_n(node, n_ia32_Test_right);
210 
211 	assert((int)n_ia32_Test_left == (int)n_ia32_Test8Bit_left
212 			&& (int)n_ia32_Test_right == (int)n_ia32_Test8Bit_right);
213 
214 	if (left == right) { /* we need a test for 0 */
215 		ir_node         *block = get_nodes_block(node);
216 		int              pn    = pn_ia32_res;
217 		ir_node         *op    = left;
218 		ir_node         *flags_proj;
219 		ir_mode         *flags_mode;
220 		ir_mode         *op_mode;
221 		ir_node         *schedpoint;
222 		produces_flag_t  produced;
223 
224 		if (get_nodes_block(left) != block)
225 			return;
226 
227 		if (is_Proj(op)) {
228 			pn = get_Proj_proj(op);
229 			op = get_Proj_pred(op);
230 		}
231 
232 		/* walk the schedule upwards and abort when we find left or some other
233 		 * node that destroys the flags */
234 		schedpoint = node;
235 		for (;;) {
236 			schedpoint = sched_prev(schedpoint);
237 			if (schedpoint == op)
238 				break;
239 			if (arch_irn_is(schedpoint, modify_flags))
240 				return;
241 			if (schedpoint == block)
242 				panic("couldn't find left");
243 		}
244 
245 		produced = check_produces_zero_sign(op, pn);
246 		if (produced == produces_no_flag)
247 			return;
248 
249 		/* make sure users only look at the sign/zero flag */
250 		foreach_out_edge(node, edge) {
251 			ir_node              *user = get_edge_src_irn(edge);
252 			ia32_condition_code_t cc  = get_ia32_condcode(user);
253 
254 			if (cc == ia32_cc_equal || cc == ia32_cc_not_equal)
255 				continue;
256 			if (produced == produces_zero_sign
257 				&& (cc == ia32_cc_sign || cc == ia32_cc_not_sign)) {
258 				continue;
259 			}
260 			return;
261 		}
262 
263 		op_mode = get_ia32_ls_mode(op);
264 		if (op_mode == NULL)
265 			op_mode = get_irn_mode(op);
266 
267 		/* Make sure we operate on the same bit size */
268 		if (get_mode_size_bits(op_mode) != get_mode_size_bits(get_ia32_ls_mode(node)))
269 			return;
270 
271 		if (produced == produces_zero_in_carry) {
272 			/* patch users to look at the carry instead of the zero flag */
273 			foreach_out_edge(node, edge) {
274 				ir_node              *user = get_edge_src_irn(edge);
275 				ia32_condition_code_t cc   = get_ia32_condcode(user);
276 
277 				switch (cc) {
278 				case ia32_cc_equal:     cc = ia32_cc_above_equal; break;
279 				case ia32_cc_not_equal: cc = ia32_cc_below;       break;
280 				default: panic("unexpected pn");
281 				}
282 				set_ia32_condcode(user, cc);
283 			}
284 		}
285 
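		/* the flag value is consumed through a Proj, so the producing node must
		 * have mode_T; any remaining users of op keep using a regular result
		 * Proj */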
286 		if (get_irn_mode(op) != mode_T) {
287 			set_irn_mode(op, mode_T);
288 
289 			/* If there are other users, reroute them to result proj */
290 			if (get_irn_n_edges(op) != 2) {
291 				ir_node *res = new_r_Proj(op, mode_Iu, pn_ia32_res);
292 				edges_reroute_except(op, res, res);
293 			}
294 		} else {
295 			if (get_irn_n_edges(left) == 2)
296 				kill_node(left);
297 		}
298 
299 		flags_mode = ia32_reg_classes[CLASS_ia32_flags].mode;
300 		flags_proj = new_r_Proj(op, flags_mode, pn_ia32_flags);
301 		arch_set_irn_register(flags_proj, &ia32_registers[REG_EFLAGS]);
302 
303 		assert(get_irn_mode(node) != mode_T);
304 
305 		be_peephole_exchange(node, flags_proj);
306 	} else if (is_ia32_Immediate(right)) {
307 		ia32_immediate_attr_t const *const imm = get_ia32_immediate_attr_const(right);
308 		unsigned                           offset;
309 
310 		/* A test with a symconst is rather strange, but better safe than sorry */
311 		if (imm->symconst != NULL)
312 			return;
313 
314 		offset = imm->offset;
315 		if (get_ia32_op_type(node) == ia32_AddrModeS) {
316 			ia32_attr_t *const attr = get_ia32_attr(node);
317 
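			/* if only one byte of the immediate is non-zero we can test just that
			 * byte of the memory operand: shift the immediate down and advance
			 * the address mode offset accordingly */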
318 			if ((offset & 0xFFFFFF00) == 0) {
319 				/* attr->am_offs += 0; */
320 			} else if ((offset & 0xFFFF00FF) == 0) {
321 				ir_node *imm_node = ia32_create_Immediate(NULL, 0, offset>>8);
322 				set_irn_n(node, n_ia32_Test_right, imm_node);
323 				attr->am_offs += 1;
324 			} else if ((offset & 0xFF00FFFF) == 0) {
325 				ir_node *imm_node = ia32_create_Immediate(NULL, 0, offset>>16);
326 				set_irn_n(node, n_ia32_Test_right, imm_node);
327 				attr->am_offs += 2;
328 			} else if ((offset & 0x00FFFFFF) == 0) {
329 				ir_node *imm_node = ia32_create_Immediate(NULL, 0, offset>>24);
330 				set_irn_n(node, n_ia32_Test_right, imm_node);
331 				attr->am_offs += 3;
332 			} else {
333 				return;
334 			}
335 		} else if (offset < 256) {
336 			arch_register_t const* const reg = arch_get_irn_register(left);
337 
338 			if (reg != &ia32_registers[REG_EAX] &&
339 					reg != &ia32_registers[REG_EBX] &&
340 					reg != &ia32_registers[REG_ECX] &&
341 					reg != &ia32_registers[REG_EDX]) {
342 				return;
343 			}
344 		} else {
345 			return;
346 		}
347 
348 		/* Technically we should build a Test8Bit because of the register
349 		 * constraints, but nobody changes registers at this point anymore. */
350 		set_ia32_ls_mode(node, mode_Bu);
351 	}
352 }
353 
354 /**
355  * AMD Athlon works faster when RET is not the destination of a
356  * conditional jump or directly preceded by another jump instruction.
357  * This can be avoided by placing a Rep prefix before the return.
358  */
359 static void peephole_ia32_Return(ir_node *node)
360 {
361 	if (!ia32_cg_config.use_pad_return)
362 		return;
363 
364 	/* check if this return is the first instruction in the block */
365 	sched_foreach_reverse_from(node, irn) {
366 		switch (get_irn_opcode(irn)) {
367 		case beo_Return:
368 			/* the return node itself, ignore */
369 			continue;
370 		case iro_Start:
371 		case beo_Start:
372 			/* ignore, no code is generated for these */
373 			continue;
374 		case beo_IncSP:
375 			/* arg, IncSP 0 nodes might occur, ignore these */
376 			if (be_get_IncSP_offset(irn) == 0)
377 				continue;
378 			return;
379 		case iro_Phi:
380 			continue;
381 		default:
382 			return;
383 		}
384 	}
385 
386 	/* ensure that the 3 byte return is generated */
387 	be_Return_set_emit_pop(node, 1);
388 }
389 
390 /* only optimize up to 48 stores behind IncSPs */
391 #define MAXPUSH_OPTIMIZE    48
392 
393 /**
394  * Tries to create Pushes from IncSP, Store combinations.
395  * The Stores are replaced by Pushes, the IncSP is modified
396  * (possibly into IncSP 0, but not removed).
397  */
398 static void peephole_IncSP_Store_to_push(ir_node *irn)
399 {
400 	int       i;
401 	int       maxslot;
402 	int       inc_ofs;
403 	ir_node  *node;
404 	ir_node  *stores[MAXPUSH_OPTIMIZE];
405 	ir_node  *block;
406 	ir_graph *irg;
407 	ir_node  *curr_sp;
408 	ir_mode  *spmode;
409 	ir_node  *first_push = NULL;
410 
411 	memset(stores, 0, sizeof(stores));
412 
413 	assert(be_is_IncSP(irn));
414 
415 	inc_ofs = be_get_IncSP_offset(irn);
416 	if (inc_ofs < 4)
417 		return;
418 
419 	/*
420 	 * We first walk the schedule after the IncSP node as long as we find
421 	 * suitable Stores that could be transformed to a Push.
422 	 * We save them in the stores array, which is indexed by the frame offset/4
423 	 * attached to the node
424 	 */
425 	maxslot = -1;
426 	for (node = sched_next(irn); !sched_is_end(node); node = sched_next(node)) {
427 		ir_node *mem;
428 		int offset;
429 		int storeslot;
430 
431 		/* it has to be a Store */
432 		if (!is_ia32_Store(node))
433 			break;
434 
435 		/* it has to use our sp value */
436 		if (get_irn_n(node, n_ia32_base) != irn)
437 			continue;
438 		/* Store has to be attached to NoMem */
439 		mem = get_irn_n(node, n_ia32_mem);
440 		if (!is_NoMem(mem))
441 			continue;
442 
443 		/* unfortunately we can't support the full AMs possible for push at the
444 		 * moment. TODO: fix this */
445 		if (!is_ia32_NoReg_GP(get_irn_n(node, n_ia32_index)))
446 			break;
447 
448 		offset = get_ia32_am_offs_int(node);
449 		/* we should NEVER access uninitialized stack BELOW the current SP */
450 		assert(offset >= 0);
451 
452 		/* storing at half-slots is bad */
453 		if ((offset & 3) != 0)
454 			break;
455 
456 		if (inc_ofs - 4 < offset || offset >= MAXPUSH_OPTIMIZE * 4)
457 			continue;
458 		storeslot = offset >> 2;
459 
460 		/* storing into the same slot twice is bad (and shouldn't happen...) */
461 		if (stores[storeslot] != NULL)
462 			break;
463 
464 		stores[storeslot] = node;
465 		if (storeslot > maxslot)
466 			maxslot = storeslot;
467 	}
468 
469 	curr_sp = irn;
470 
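	/* only the contiguous run of filled slots starting at slot 0 can become
	 * Pushes, because every Push implicitly moves the stack pointer by 4 */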
471 	for (i = -1; i < maxslot; ++i) {
472 		if (stores[i + 1] == NULL)
473 			break;
474 	}
475 
476 	/* walk through the Stores and create Pushs for them */
477 	block  = get_nodes_block(irn);
478 	spmode = get_irn_mode(irn);
479 	irg    = get_irn_irg(irn);
480 	for (; i >= 0; --i) {
481 		const arch_register_t *spreg;
482 		ir_node *push;
483 		ir_node *val, *mem, *mem_proj;
484 		ir_node *store = stores[i];
485 		ir_node *noreg = ia32_new_NoReg_gp(irg);
486 
487 		val = get_irn_n(store, n_ia32_unary_op);
488 		mem = get_irn_n(store, n_ia32_mem);
489 		spreg = arch_get_irn_register(curr_sp);
490 
491 		push = new_bd_ia32_Push(get_irn_dbg_info(store), block, noreg, noreg,
492 		                        mem, val, curr_sp);
493 		copy_mark(store, push);
494 
495 		if (first_push == NULL)
496 			first_push = push;
497 
498 		sched_add_after(skip_Proj(curr_sp), push);
499 
500 		/* create stackpointer Proj */
501 		curr_sp = new_r_Proj(push, spmode, pn_ia32_Push_stack);
502 		arch_set_irn_register(curr_sp, spreg);
503 
504 		/* create memory Proj */
505 		mem_proj = new_r_Proj(push, mode_M, pn_ia32_Push_M);
506 
507 		/* rewire Store Projs */
508 		foreach_out_edge_safe(store, edge) {
509 			ir_node *proj = get_edge_src_irn(edge);
510 			if (!is_Proj(proj))
511 				continue;
512 			switch (get_Proj_proj(proj)) {
513 			case pn_ia32_Store_M:
514 				exchange(proj, mem_proj);
515 				break;
516 			default:
517 				panic("unexpected Proj on Store->IncSp");
518 			}
519 		}
520 
521 		/* use the memproj now */
522 		be_peephole_exchange(store, push);
523 
524 		inc_ofs -= 4;
525 	}
526 
527 	foreach_out_edge_safe(irn, edge) {
528 		ir_node *const src = get_edge_src_irn(edge);
529 		int      const pos = get_edge_src_pos(edge);
530 
531 		if (src == first_push)
532 			continue;
533 
534 		set_irn_n(src, pos, curr_sp);
535 	}
536 
537 	be_set_IncSP_offset(irn, inc_ofs);
538 }
539 
540 #if 0
541 /**
542  * Creates a Push instruction before the given schedule point.
543  *
544  * @param dbgi        debug info
545  * @param block       the block
546  * @param stack       the previous stack value
547  * @param schedpoint  the new node is added before this node
549  *
550  * @return the new stack value
551  */
552 static ir_node *create_push(dbg_info *dbgi, ir_node *block,
553                             ir_node *stack, ir_node *schedpoint)
554 {
555 	const arch_register_t *esp = &ia32_registers[REG_ESP];
556 
557 	ir_node *val   = ia32_new_NoReg_gp(get_irn_irg(block));
558 	ir_node *noreg = ia32_new_NoReg_gp(get_irn_irg(block));
559 	ir_graph *irg  = get_irn_irg(block);
560 	ir_node *nomem = get_irg_no_mem(irg);
561 	ir_node *push  = new_bd_ia32_Push(dbgi, block, noreg, noreg, nomem, val, stack);
562 	sched_add_before(schedpoint, push);
563 
564 	stack = new_r_Proj(push, mode_Iu, pn_ia32_Push_stack);
565 	arch_set_irn_register(stack, esp);
566 
567 	return stack;
568 }
569 
570 static void peephole_store_incsp(ir_node *store)
571 {
572 	dbg_info *dbgi;
573 	ir_node  *block;
574 	ir_node  *noreg;
575 	ir_node  *mem;
576 	ir_node  *push;
577 	ir_node  *val;
578 	ir_node  *base;
579 	ir_node  *index;
580 	ir_node  *am_base = get_irn_n(store, n_ia32_Store_base);
581 	if (!be_is_IncSP(am_base)
582 			|| get_nodes_block(am_base) != get_nodes_block(store))
583 		return;
584 	mem = get_irn_n(store, n_ia32_Store_mem);
585 	if (!is_ia32_NoReg_GP(get_irn_n(store, n_ia32_Store_index))
586 			|| !is_NoMem(mem))
587 		return;
588 
589 	int incsp_offset = be_get_IncSP_offset(am_base);
590 	if (incsp_offset <= 0)
591 		return;
592 
593 	/* we have to be at offset 0 */
594 	int my_offset = get_ia32_am_offs_int(store);
595 	if (my_offset != 0) {
596 		/* TODO here: find out whether there is a store with offset 0 before
597 		 * us and whether we can move it down to our place */
598 		return;
599 	}
600 	ir_mode *ls_mode = get_ia32_ls_mode(store);
601 	int my_store_size = get_mode_size_bytes(ls_mode);
602 
603 	if (my_offset + my_store_size > incsp_offset)
604 		return;
605 
606 	/* correctness checking:
607 		- no one else may write to that stack slot
608 		    (because after the transformation the IncSP won't allocate it anymore)
609 	*/
610 	sched_foreach_reverse_from(store, node) {
611 		int i, arity;
612 
613 		if (node == am_base)
614 			break;
615 
616 		/* make sure no one else can use the space on the stack */
617 		arity = get_irn_arity(node);
618 		for (i = 0; i < arity; ++i) {
619 			ir_node *pred = get_irn_n(node, i);
620 			if (pred != am_base)
621 				continue;
622 
623 			if (i == n_ia32_base &&
624 					(get_ia32_op_type(node) == ia32_AddrModeS
625 					 || get_ia32_op_type(node) == ia32_AddrModeD)) {
626 				int      node_offset  = get_ia32_am_offs_int(node);
627 				ir_mode *node_ls_mode = get_ia32_ls_mode(node);
628 				int      node_size    = get_mode_size_bytes(node_ls_mode);
629 				/* overlapping with our position? abort */
630 				if (node_offset < my_offset + my_store_size
631 						&& node_offset + node_size >= my_offset)
632 					return;
633 				/* otherwise it's fine */
634 				continue;
635 			}
636 
637 			/* strange use of esp: abort */
638 			return;
639 		}
640 	}
641 
642 	/* all ok, change to push */
643 	dbgi  = get_irn_dbg_info(store);
644 	block = get_nodes_block(store);
645 	noreg = ia32_new_NoReg_gp(get_irn_irg(store));
646 	val   = get_irn_n(store, n_ia32_Store_val);
647 
648 	push  = new_bd_ia32_Push(dbgi, block, noreg, noreg, mem, val, am_base);
649 	(void)push; /* TODO: schedule the Push and rewire the Store users */
650 	create_push(dbgi, block, am_base, store);
651 }
652 #endif
653 
654 /**
655  * Return true if a mode can be stored in the GP register set
656  */
657 static inline int mode_needs_gp_reg(ir_mode *mode)
658 {
659 	if (mode == ia32_mode_fpcw)
660 		return 0;
661 	if (get_mode_size_bits(mode) > 32)
662 		return 0;
663 	return mode_is_int(mode) || mode_is_reference(mode) || mode == mode_b;
664 }
665 
666 /**
667  * Tries to create Pops from Load, IncSP combinations.
668  * The Loads are replaced by Pops, the IncSP is modified
669  * (possibly into IncSP 0, but not removed).
670  */
671 static void peephole_Load_IncSP_to_pop(ir_node *irn)
672 {
673 	const arch_register_t *esp = &ia32_registers[REG_ESP];
674 	int      i, maxslot, inc_ofs, ofs;
675 	ir_node  *node, *pred_sp, *block;
676 	ir_node  *loads[MAXPUSH_OPTIMIZE];
677 	unsigned regmask = 0;
678 	unsigned copymask = ~0;
679 
680 	memset(loads, 0, sizeof(loads));
681 	assert(be_is_IncSP(irn));
682 
683 	inc_ofs = -be_get_IncSP_offset(irn);
684 	if (inc_ofs < 4)
685 		return;
686 
687 	/*
688 	 * We first walk the schedule before the IncSP node as long as we find
689 	 * suitable Loads that could be transformed to a Pop.
690 	 * We save them in the loads array, which is indexed by the frame offset/4
691 	 * attached to the node
692 	 */
693 	maxslot = -1;
694 	pred_sp = be_get_IncSP_pred(irn);
695 	for (node = sched_prev(irn); !sched_is_end(node); node = sched_prev(node)) {
696 		int offset;
697 		int loadslot;
698 		const arch_register_t *sreg, *dreg;
699 
700 		/* it has to be a Load */
701 		if (!is_ia32_Load(node)) {
702 			if (be_is_Copy(node)) {
703 				if (!mode_needs_gp_reg(get_irn_mode(node))) {
704 					/* not a GP copy, ignore */
705 					continue;
706 				}
707 				dreg = arch_get_irn_register(node);
708 				sreg = arch_get_irn_register(be_get_Copy_op(node));
709 				if (regmask & copymask & (1 << sreg->index)) {
710 					break;
711 				}
712 				if (regmask & copymask & (1 << dreg->index)) {
713 					break;
714 				}
715 				/* we CAN skip Copies if neither the destination nor the source
716 				 * is in our regmask, i.e. none of our future Pops will overwrite it */
717 				regmask |= (1 << dreg->index) | (1 << sreg->index);
718 				copymask &= ~((1 << dreg->index) | (1 << sreg->index));
719 				continue;
720 			}
721 			break;
722 		}
723 
724 		/* we can handle only GP loads */
725 		if (!mode_needs_gp_reg(get_ia32_ls_mode(node)))
726 			continue;
727 
728 		/* it has to use our predecessor sp value */
729 		if (get_irn_n(node, n_ia32_base) != pred_sp) {
730 			/* it would be ok if this load does not use a Pop result,
731 			 * but we do not check this */
732 			break;
733 		}
734 
735 		/* should have NO index */
736 		if (!is_ia32_NoReg_GP(get_irn_n(node, n_ia32_index)))
737 			break;
738 
739 		offset = get_ia32_am_offs_int(node);
740 		/* we should NEVER access uninitialized stack BELOW the current SP */
741 		assert(offset >= 0);
742 
743 		/* loading from half-slots is bad */
744 		if ((offset & 3) != 0)
745 			break;
746 
747 		if (offset < 0 || offset >= MAXPUSH_OPTIMIZE * 4)
748 			continue;
749 		/* ignore those outside the possible window */
750 		if (offset > inc_ofs - 4)
751 			continue;
752 		loadslot = offset >> 2;
753 
754 		/* loading from the same slot twice is bad (and shouldn't happen...) */
755 		if (loads[loadslot] != NULL)
756 			break;
757 
758 		dreg = arch_get_irn_register_out(node, pn_ia32_Load_res);
759 		if (regmask & (1 << dreg->index)) {
760 			/* this register is already used */
761 			break;
762 		}
763 		regmask |= 1 << dreg->index;
764 
765 		loads[loadslot] = node;
766 		if (loadslot > maxslot)
767 			maxslot = loadslot;
768 	}
769 
770 	if (maxslot < 0)
771 		return;
772 
773 	/* find the start of the contiguous run of Loads ending at maxslot */
774 	for (i = maxslot; i >= 0; --i) {
775 		ir_node *load = loads[i];
776 
777 		if (load == NULL)
778 			break;
779 	}
780 
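	/* slots 0..i stay with a (smaller) IncSP created below, slots i+1..maxslot
	 * are consumed by Pops, and the original IncSP keeps releasing whatever
	 * lies above maxslot */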
781 	ofs = inc_ofs - (maxslot + 1) * 4;
782 	inc_ofs = (i+1) * 4;
783 
784 	/* create a new IncSP if needed */
785 	block = get_nodes_block(irn);
786 	if (inc_ofs > 0) {
787 		pred_sp = be_new_IncSP(esp, block, pred_sp, -inc_ofs, be_get_IncSP_align(irn));
788 		sched_add_before(irn, pred_sp);
789 	}
790 
791 	/* walk through the Loads and create Pops for them */
792 	for (++i; i <= maxslot; ++i) {
793 		ir_node *load = loads[i];
794 		ir_node *mem, *pop;
795 		const arch_register_t *reg;
796 
797 		mem = get_irn_n(load, n_ia32_mem);
798 		reg = arch_get_irn_register_out(load, pn_ia32_Load_res);
799 
800 		pop = new_bd_ia32_Pop(get_irn_dbg_info(load), block, mem, pred_sp);
801 		arch_set_irn_register_out(pop, pn_ia32_Load_res, reg);
802 
803 		copy_mark(load, pop);
804 
805 		/* create stackpointer Proj */
806 		pred_sp = new_r_Proj(pop, mode_Iu, pn_ia32_Pop_stack);
807 		arch_set_irn_register(pred_sp, esp);
808 
809 		sched_add_before(irn, pop);
810 
811 		/* rewire now */
812 		foreach_out_edge_safe(load, edge) {
813 			ir_node *proj = get_edge_src_irn(edge);
814 
815 			set_Proj_pred(proj, pop);
816 		}
817 
818 		/* we can remove the Load now */
819 		sched_remove(load);
820 		kill_node(load);
821 	}
822 
823 	be_set_IncSP_offset(irn, -ofs);
824 	be_set_IncSP_pred(irn, pred_sp);
825 }
826 
827 
828 /**
829  * Find a free GP register if possible, else return NULL.
830  */
831 static const arch_register_t *get_free_gp_reg(ir_graph *irg)
832 {
833 	be_irg_t *birg = be_birg_from_irg(irg);
834 	int i;
835 
836 	for (i = 0; i < N_ia32_gp_REGS; ++i) {
837 		const arch_register_t *reg = &ia32_reg_classes[CLASS_ia32_gp].regs[i];
838 		if (!rbitset_is_set(birg->allocatable_regs, reg->global_index))
839 			continue;
840 
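		/* the peephole pass tracks the current value of every register; if no
		 * value is recorded the register is free at this point */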
841 		if (be_peephole_get_value(reg->global_index) == NULL)
842 			return reg;
843 	}
844 
845 	return NULL;
846 }
847 
848 /**
849  * Creates a Pop instruction before the given schedule point.
850  *
851  * @param dbgi        debug info
852  * @param block       the block
853  * @param stack       the previous stack value
854  * @param schedpoint  the new node is added before this node
855  * @param reg         the register to pop
856  *
857  * @return the new stack value
858  */
859 static ir_node *create_pop(dbg_info *dbgi, ir_node *block,
860                            ir_node *stack, ir_node *schedpoint,
861                            const arch_register_t *reg)
862 {
863 	const arch_register_t *esp = &ia32_registers[REG_ESP];
864 	ir_graph *irg = get_irn_irg(block);
865 	ir_node *pop;
866 	ir_node *keep;
867 	ir_node *val;
868 	ir_node *in[1];
869 
870 	pop   = new_bd_ia32_Pop(dbgi, block, get_irg_no_mem(irg), stack);
871 
872 	stack = new_r_Proj(pop, mode_Iu, pn_ia32_Pop_stack);
873 	arch_set_irn_register(stack, esp);
874 	val   = new_r_Proj(pop, mode_Iu, pn_ia32_Pop_res);
875 	arch_set_irn_register(val, reg);
876 
877 	sched_add_before(schedpoint, pop);
878 
879 	in[0] = val;
880 	keep  = be_new_Keep(block, 1, in);
881 	sched_add_before(schedpoint, keep);
882 
883 	return stack;
884 }
885 
886 /**
887  * Optimize an IncSP by replacing it with Push/Pop.
888  */
889 static void peephole_be_IncSP(ir_node *node)
890 {
891 	const arch_register_t *esp = &ia32_registers[REG_ESP];
892 	const arch_register_t *reg;
893 	dbg_info              *dbgi;
894 	ir_node               *block;
895 	ir_node               *stack;
896 	int                    offset;
897 
898 	/* first optimize incsp->incsp combinations */
899 	node = be_peephole_IncSP_IncSP(node);
900 
901 	/* transform IncSP->Store combinations to Push where possible */
902 	peephole_IncSP_Store_to_push(node);
903 
904 	/* transform Load->IncSP combinations to Pop where possible */
905 	peephole_Load_IncSP_to_pop(node);
906 
907 	if (arch_get_irn_register(node) != esp)
908 		return;
909 
910 	/* replace IncSP -4 by Pop freereg when possible */
911 	offset = be_get_IncSP_offset(node);
912 	if ((offset != -8 || ia32_cg_config.use_add_esp_8) &&
913 	    (offset != -4 || ia32_cg_config.use_add_esp_4) &&
914 	    (offset != +4 || ia32_cg_config.use_sub_esp_4) &&
915 	    (offset != +8 || ia32_cg_config.use_sub_esp_8))
916 		return;
917 
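	/* a 4 or 8 byte adjustment can be done with one or two Pops into a free
	 * register (when releasing stack space) or one or two Pushes of eax (when
	 * allocating it); the cg_config flags above tell us when this is
	 * preferable to an add/sub of esp */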
918 	if (offset < 0) {
919 		/* we need a free register for pop */
920 		reg = get_free_gp_reg(get_irn_irg(node));
921 		if (reg == NULL)
922 			return;
923 
924 		dbgi  = get_irn_dbg_info(node);
925 		block = get_nodes_block(node);
926 		stack = be_get_IncSP_pred(node);
927 
928 		stack = create_pop(dbgi, block, stack, node, reg);
929 
930 		if (offset == -8) {
931 			stack = create_pop(dbgi, block, stack, node, reg);
932 		}
933 	} else {
934 		dbgi  = get_irn_dbg_info(node);
935 		block = get_nodes_block(node);
936 		stack = be_get_IncSP_pred(node);
937 		stack = new_bd_ia32_PushEax(dbgi, block, stack);
938 		arch_set_irn_register(stack, esp);
939 		sched_add_before(node, stack);
940 
941 		if (offset == +8) {
942 			stack = new_bd_ia32_PushEax(dbgi, block, stack);
943 			arch_set_irn_register(stack, esp);
944 			sched_add_before(node, stack);
945 		}
946 	}
947 
948 	be_peephole_exchange(node, stack);
949 }
950 
951 /**
952  * Peephole optimisation for ia32_Const's
953  */
954 static void peephole_ia32_Const(ir_node *node)
955 {
956 	const ia32_immediate_attr_t *attr = get_ia32_immediate_attr_const(node);
957 	const arch_register_t       *reg;
958 	ir_node                     *block;
959 	dbg_info                    *dbgi;
960 	ir_node                     *xorn;
961 
962 	/* try to transform a "mov 0, reg" into "xor reg, reg" */
963 	if (attr->offset != 0 || attr->symconst != NULL)
964 		return;
965 	if (ia32_cg_config.use_mov_0)
966 		return;
967 	/* xor destroys the flags, so no-one must be using them */
968 	if (be_peephole_get_value(REG_EFLAGS) != NULL)
969 		return;
970 
971 	reg = arch_get_irn_register(node);
972 	assert(be_peephole_get_reg_value(reg) == NULL);
973 
974 	/* create xor(produceval, produceval) */
975 	block = get_nodes_block(node);
976 	dbgi  = get_irn_dbg_info(node);
977 	xorn  = new_bd_ia32_Xor0(dbgi, block);
978 	arch_set_irn_register(xorn, reg);
979 
980 	sched_add_before(node, xorn);
981 
982 	copy_mark(node, xorn);
983 	be_peephole_exchange(node, xorn);
984 }
985 
986 static inline int is_noreg(const ir_node *node)
987 {
988 	return is_ia32_NoReg_GP(node);
989 }
990 
991 ir_node *ia32_immediate_from_long(long val)
992 {
993 	ir_graph *irg         = current_ir_graph;
994 	ir_node  *start_block = get_irg_start_block(irg);
995 	ir_node  *immediate
996 		= new_bd_ia32_Immediate(NULL, start_block, NULL, 0, 0, val);
997 	arch_set_irn_register(immediate, &ia32_registers[REG_GP_NOREG]);
998 
999 	return immediate;
1000 }
1001 
1002 static ir_node *create_immediate_from_am(const ir_node *node)
1003 {
1004 	ir_node   *block   = get_nodes_block(node);
1005 	int        offset  = get_ia32_am_offs_int(node);
1006 	int        sc_sign = is_ia32_am_sc_sign(node);
1007 	const ia32_attr_t *attr = get_ia32_attr_const(node);
1008 	int        sc_no_pic_adjust = attr->data.am_sc_no_pic_adjust;
1009 	ir_entity *entity  = get_ia32_am_sc(node);
1010 	ir_node   *res;
1011 
1012 	res = new_bd_ia32_Immediate(NULL, block, entity, sc_sign, sc_no_pic_adjust,
1013 	                            offset);
1014 	arch_set_irn_register(res, &ia32_registers[REG_GP_NOREG]);
1015 	return res;
1016 }
1017 
1018 static int is_am_one(const ir_node *node)
1019 {
1020 	int        offset  = get_ia32_am_offs_int(node);
1021 	ir_entity *entity  = get_ia32_am_sc(node);
1022 
1023 	return offset == 1 && entity == NULL;
1024 }
1025 
1026 static int is_am_minus_one(const ir_node *node)
1027 {
1028 	int        offset  = get_ia32_am_offs_int(node);
1029 	ir_entity *entity  = get_ia32_am_sc(node);
1030 
1031 	return offset == -1 && entity == NULL;
1032 }
1033 
1034 /**
1035  * Transforms a LEA into an Add or SHL if possible.
1036  */
1037 static void peephole_ia32_Lea(ir_node *node)
1038 {
1039 	ir_node               *base;
1040 	ir_node               *index;
1041 	const arch_register_t *base_reg;
1042 	const arch_register_t *index_reg;
1043 	const arch_register_t *out_reg;
1044 	int                    scale;
1045 	int                    has_immediates;
1046 	ir_node               *op1;
1047 	ir_node               *op2;
1048 	dbg_info              *dbgi;
1049 	ir_node               *block;
1050 	ir_node               *res;
1051 
1052 	assert(is_ia32_Lea(node));
1053 
1054 	/* we can only do this if it is allowed to clobber the flags */
1055 	if (be_peephole_get_value(REG_EFLAGS) != NULL)
1056 		return;
1057 
1058 	base  = get_irn_n(node, n_ia32_Lea_base);
1059 	index = get_irn_n(node, n_ia32_Lea_index);
1060 
1061 	if (is_noreg(base)) {
1062 		base     = NULL;
1063 		base_reg = NULL;
1064 	} else {
1065 		base_reg = arch_get_irn_register(base);
1066 	}
1067 	if (is_noreg(index)) {
1068 		index     = NULL;
1069 		index_reg = NULL;
1070 	} else {
1071 		index_reg = arch_get_irn_register(index);
1072 	}
1073 
1074 	if (base == NULL && index == NULL) {
1075 		/* we shouldn't construct these in the first place... */
1076 #ifdef DEBUG_libfirm
1077 		ir_fprintf(stderr, "Optimisation warning: found immediate only lea\n");
1078 #endif
1079 		return;
1080 	}
1081 
1082 	out_reg = arch_get_irn_register(node);
1083 	scale   = get_ia32_am_scale(node);
1084 	assert(!is_ia32_need_stackent(node) || get_ia32_frame_ent(node) != NULL);
1085 	/* check if we have immediate values (frame entities should already be
1086 	 * expressed in the offsets) */
1087 	if (get_ia32_am_offs_int(node) != 0 || get_ia32_am_sc(node) != NULL) {
1088 		has_immediates = 1;
1089 	} else {
1090 		has_immediates = 0;
1091 	}
1092 
1093 	/* we can transform leas where the out register is the same as either the
1094 	 * base or index register back to an Add or Shl */
1095 	if (out_reg == base_reg) {
1096 		if (index == NULL) {
1097 #ifdef DEBUG_libfirm
1098 			if (!has_immediates) {
1099 				ir_fprintf(stderr, "Optimisation warning: found lea which is "
1100 				           "just a copy\n");
1101 			}
1102 #endif
1103 			op1 = base;
1104 			goto make_add_immediate;
1105 		}
1106 		if (scale == 0 && !has_immediates) {
1107 			op1 = base;
1108 			op2 = index;
1109 			goto make_add;
1110 		}
1111 		/* can't create an add */
1112 		return;
1113 	} else if (out_reg == index_reg) {
1114 		if (base == NULL) {
1115 			if (has_immediates && scale == 0) {
1116 				op1 = index;
1117 				goto make_add_immediate;
1118 			} else if (!has_immediates && scale > 0) {
1119 				op1 = index;
1120 				op2 = ia32_immediate_from_long(scale);
1121 				goto make_shl;
1122 			} else if (!has_immediates) {
1123 #ifdef DEBUG_libfirm
1124 				ir_fprintf(stderr, "Optimisation warning: found lea which is "
1125 				           "just a copy\n");
1126 #endif
1127 			}
1128 		} else if (scale == 0 && !has_immediates) {
1129 			op1 = index;
1130 			op2 = base;
1131 			goto make_add;
1132 		}
1133 		/* can't create an add */
1134 		return;
1135 	} else {
1136 		/* can't create an add */
1137 		return;
1138 	}
1139 
1140 make_add_immediate:
1141 	if (ia32_cg_config.use_incdec) {
1142 		if (is_am_one(node)) {
1143 			dbgi  = get_irn_dbg_info(node);
1144 			block = get_nodes_block(node);
1145 			res   = new_bd_ia32_Inc(dbgi, block, op1);
1146 			arch_set_irn_register(res, out_reg);
1147 			goto exchange;
1148 		}
1149 		if (is_am_minus_one(node)) {
1150 			dbgi  = get_irn_dbg_info(node);
1151 			block = get_nodes_block(node);
1152 			res   = new_bd_ia32_Dec(dbgi, block, op1);
1153 			arch_set_irn_register(res, out_reg);
1154 			goto exchange;
1155 		}
1156 	}
1157 	op2 = create_immediate_from_am(node);
1158 
1159 make_add:
1160 	dbgi  = get_irn_dbg_info(node);
1161 	block = get_nodes_block(node);
1162 	ir_graph *irg   = get_irn_irg(node);
1163 	ir_node  *noreg = ia32_new_NoReg_gp(irg);
1164 	ir_node  *nomem = get_irg_no_mem(irg);
1165 	res   = new_bd_ia32_Add(dbgi, block, noreg, noreg, nomem, op1, op2);
1166 	arch_set_irn_register(res, out_reg);
1167 	set_ia32_commutative(res);
1168 	goto exchange;
1169 
1170 make_shl:
1171 	dbgi  = get_irn_dbg_info(node);
1172 	block = get_nodes_block(node);
1173 	res   = new_bd_ia32_Shl(dbgi, block, op1, op2);
1174 	arch_set_irn_register(res, out_reg);
1175 	goto exchange;
1176 
1177 exchange:
1178 	SET_IA32_ORIG_NODE(res, node);
1179 
1180 	/* add new ADD/SHL to schedule */
1181 	DBG_OPT_LEA2ADD(node, res);
1182 
1183 	/* exchange the Add and the LEA */
1184 	sched_add_before(node, res);
1185 	copy_mark(node, res);
1186 	be_peephole_exchange(node, res);
1187 }
1188 
1189 /**
1190  * Split an IMul mem, imm into a Load mem and an IMul reg, imm if possible.
1191  */
1192 static void peephole_ia32_Imul_split(ir_node *imul)
1193 {
1194 	const ir_node         *right = get_irn_n(imul, n_ia32_IMul_right);
1195 	const arch_register_t *reg;
1196 	ir_node               *res;
1197 
1198 	if (!is_ia32_Immediate(right) || get_ia32_op_type(imul) != ia32_AddrModeS) {
1199 		/* not a (memory, immediate) form, ignore */
1200 		return;
1201 	}
1202 	/* we need a free register */
1203 	reg = get_free_gp_reg(get_irn_irg(imul));
1204 	if (reg == NULL)
1205 		return;
1206 
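	/* ia32_turn_back_am() splits the memory operand off into a separate Load
	 * again; its result gets the free register, so the IMul no longer needs
	 * the (memory, imm32) form that is slow on this target (see
	 * use_imul_mem_imm32) */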
1207 	/* fine, we can rebuild it */
1208 	res = ia32_turn_back_am(imul);
1209 	arch_set_irn_register(res, reg);
1210 }
1211 
1212 /**
1213  * Replace xorps r,r and xorpd r,r by pxor r,r
1214  */
1215 static void peephole_ia32_xZero(ir_node *xorn)
1216 {
1217 	set_irn_op(xorn, op_ia32_xPzero);
1218 }
1219 
1220 /**
1221  * Replace a 16 bit sign extension from ax to eax by the shorter cwtl
1222  */
1223 static void peephole_ia32_Conv_I2I(ir_node *node)
1224 {
1225 	const arch_register_t *eax          = &ia32_registers[REG_EAX];
1226 	ir_mode               *smaller_mode = get_ia32_ls_mode(node);
1227 	ir_node               *val          = get_irn_n(node, n_ia32_Conv_I2I_val);
1228 	dbg_info              *dbgi;
1229 	ir_node               *block;
1230 	ir_node               *cwtl;
1231 
1232 	if (get_mode_size_bits(smaller_mode) != 16 ||
1233 			!mode_is_signed(smaller_mode)          ||
1234 			eax != arch_get_irn_register(val)      ||
1235 			eax != arch_get_irn_register_out(node, pn_ia32_Conv_I2I_res))
1236 		return;
1237 
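	/* cwtl (cwde) sign extends ax into eax with a single one byte opcode,
	 * while the equivalent movsx needs three bytes */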
1238 	dbgi  = get_irn_dbg_info(node);
1239 	block = get_nodes_block(node);
1240 	cwtl  = new_bd_ia32_Cwtl(dbgi, block, val);
1241 	arch_set_irn_register(cwtl, eax);
1242 	sched_add_before(node, cwtl);
1243 	be_peephole_exchange(node, cwtl);
1244 }
1245 
1246 /**
1247  * Register a peephole optimisation function.
1248  */
1249 static void register_peephole_optimisation(ir_op *op, peephole_opt_func func)
1250 {
1251 	assert(op->ops.generic == NULL);
1252 	op->ops.generic = (op_func)func;
1253 }
1254 
1255 /* Perform peephole-optimizations. */
1256 void ia32_peephole_optimization(ir_graph *irg)
1257 {
1258 	/* we currently do it in 2 passes because:
1259 	 *    Lea -> Add could be useful as a flag producer for a Test later
1260 	 */
1261 
1262 	/* pass 1 */
1263 	ir_clear_opcodes_generic_func();
1264 	register_peephole_optimisation(op_ia32_Cmp,      peephole_ia32_Cmp);
1265 	register_peephole_optimisation(op_ia32_Cmp8Bit,  peephole_ia32_Cmp);
1266 	register_peephole_optimisation(op_ia32_Lea,      peephole_ia32_Lea);
1267 	if (ia32_cg_config.use_short_sex_eax)
1268 		register_peephole_optimisation(op_ia32_Conv_I2I, peephole_ia32_Conv_I2I);
1269 	if (ia32_cg_config.use_pxor)
1270 		register_peephole_optimisation(op_ia32_xZero, peephole_ia32_xZero);
1271 	if (! ia32_cg_config.use_imul_mem_imm32)
1272 		register_peephole_optimisation(op_ia32_IMul, peephole_ia32_Imul_split);
1273 	be_peephole_opt(irg);
1274 
1275 	/* pass 2 */
1276 	ir_clear_opcodes_generic_func();
1277 	register_peephole_optimisation(op_ia32_Const,    peephole_ia32_Const);
1278 	register_peephole_optimisation(op_be_IncSP,      peephole_be_IncSP);
1279 	register_peephole_optimisation(op_ia32_Test,     peephole_ia32_Test);
1280 	register_peephole_optimisation(op_ia32_Test8Bit, peephole_ia32_Test);
1281 	register_peephole_optimisation(op_be_Return,     peephole_ia32_Return);
1282 	be_peephole_opt(irg);
1283 }
1284 
1285 /**
1286  * Removes the node from the schedule if it is not used anymore. If node is a
1287  * mode_T node all its Projs are removed as well.
1288  * @param node  The node to be removed from the schedule
1289  */
1290 static inline void try_kill(ir_node *node)
1291 {
1292 	if (get_irn_mode(node) == mode_T) {
1293 		foreach_out_edge_safe(node, edge) {
1294 			ir_node *proj = get_edge_src_irn(edge);
1295 			try_kill(proj);
1296 		}
1297 	}
1298 
1299 	if (get_irn_n_edges(node) != 0)
1300 		return;
1301 
1302 	if (sched_is_scheduled(node)) {
1303 		sched_remove(node);
1304 	}
1305 
1306 	kill_node(node);
1307 }
1308 
1309 static void optimize_conv_store(ir_node *node)
1310 {
1311 	ir_node *pred;
1312 	ir_node *pred_proj;
1313 	ir_mode *conv_mode;
1314 	ir_mode *store_mode;
1315 
1316 	if (!is_ia32_Store(node) && !is_ia32_Store8Bit(node))
1317 		return;
1318 
1319 	assert((int)n_ia32_Store_val == (int)n_ia32_Store8Bit_val);
1320 	pred_proj = get_irn_n(node, n_ia32_Store_val);
1321 	if (is_Proj(pred_proj)) {
1322 		pred = get_Proj_pred(pred_proj);
1323 	} else {
1324 		pred = pred_proj;
1325 	}
1326 	if (!is_ia32_Conv_I2I(pred) && !is_ia32_Conv_I2I8Bit(pred))
1327 		return;
1328 	if (get_ia32_op_type(pred) != ia32_Normal)
1329 		return;
1330 
1331 	/* the store only stores the lower bits, so we only need the conv
1332 	 * if it shrinks the mode */
1333 	conv_mode  = get_ia32_ls_mode(pred);
1334 	store_mode = get_ia32_ls_mode(node);
1335 	if (get_mode_size_bits(conv_mode) < get_mode_size_bits(store_mode))
1336 		return;
1337 
1338 	ir_fprintf(stderr, "Optimisation warning: unoptimized ia32 Store(Conv) (%+F, %+F)\n", node, pred);
1339 	set_irn_n(node, n_ia32_Store_val, get_irn_n(pred, n_ia32_Conv_I2I_val));
1340 	if (get_irn_n_edges(pred_proj) == 0) {
1341 		kill_node(pred_proj);
1342 		if (pred != pred_proj)
1343 			kill_node(pred);
1344 	}
1345 }
1346 
1347 static void optimize_load_conv(ir_node *node)
1348 {
1349 	ir_node *pred, *predpred;
1350 	ir_mode *load_mode;
1351 	ir_mode *conv_mode;
1352 
1353 	if (!is_ia32_Conv_I2I(node) && !is_ia32_Conv_I2I8Bit(node))
1354 		return;
1355 
1356 	assert((int)n_ia32_Conv_I2I_val == (int)n_ia32_Conv_I2I8Bit_val);
1357 	pred = get_irn_n(node, n_ia32_Conv_I2I_val);
1358 	if (!is_Proj(pred))
1359 		return;
1360 
1361 	predpred = get_Proj_pred(pred);
1362 	if (!is_ia32_Load(predpred))
1363 		return;
1364 
1365 	/* the load is sign extending the upper bits, so we only need the conv
1366 	 * if it shrinks the mode */
1367 	load_mode = get_ia32_ls_mode(predpred);
1368 	conv_mode = get_ia32_ls_mode(node);
1369 	if (get_mode_size_bits(conv_mode) < get_mode_size_bits(load_mode))
1370 		return;
1371 
1372 	if (get_mode_sign(conv_mode) != get_mode_sign(load_mode)) {
1373 		/* change the load if it has only 1 user */
1374 		if (get_irn_n_edges(pred) == 1) {
1375 			ir_mode *newmode;
1376 			if (get_mode_sign(conv_mode)) {
1377 				newmode = find_signed_mode(load_mode);
1378 			} else {
1379 				newmode = find_unsigned_mode(load_mode);
1380 			}
1381 			assert(newmode != NULL);
1382 			set_ia32_ls_mode(predpred, newmode);
1383 		} else {
1384 			/* otherwise we have to keep the conv */
1385 			return;
1386 		}
1387 	}
1388 
1389 	/* kill the conv */
1390 	ir_fprintf(stderr, "Optimisation warning: unoptimized ia32 Conv(Load) (%+F, %+F)\n", node, predpred);
1391 	exchange(node, pred);
1392 }
1393 
1394 static void optimize_conv_conv(ir_node *node)
1395 {
1396 	ir_node *pred_proj, *pred, *result_conv;
1397 	ir_mode *pred_mode, *conv_mode;
1398 	int      conv_mode_bits;
1399 	int      pred_mode_bits;
1400 
1401 	if (!is_ia32_Conv_I2I(node) && !is_ia32_Conv_I2I8Bit(node))
1402 		return;
1403 
1404 	assert((int)n_ia32_Conv_I2I_val == (int)n_ia32_Conv_I2I8Bit_val);
1405 	pred_proj = get_irn_n(node, n_ia32_Conv_I2I_val);
1406 	if (is_Proj(pred_proj))
1407 		pred = get_Proj_pred(pred_proj);
1408 	else
1409 		pred = pred_proj;
1410 
1411 	if (!is_ia32_Conv_I2I(pred) && !is_ia32_Conv_I2I8Bit(pred))
1412 		return;
1413 
1414 	/* we know that after a conv, the upper bits are sign extended
1415 	 * so we only need the 2nd conv if it shrinks the mode */
1416 	conv_mode      = get_ia32_ls_mode(node);
1417 	conv_mode_bits = get_mode_size_bits(conv_mode);
1418 	pred_mode      = get_ia32_ls_mode(pred);
1419 	pred_mode_bits = get_mode_size_bits(pred_mode);
1420 
1421 	if (conv_mode_bits == pred_mode_bits
1422 			&& get_mode_sign(conv_mode) == get_mode_sign(pred_mode)) {
1423 		result_conv = pred_proj;
1424 	} else if (conv_mode_bits <= pred_mode_bits) {
1425 		/* if the 2nd conv is smaller than the first one, then we can always take the
1426 		 * 2nd conv */
1427 		if (get_irn_n_edges(pred_proj) == 1) {
1428 			result_conv = pred_proj;
1429 			set_ia32_ls_mode(pred, conv_mode);
1430 
1431 			/* Argh: we must change the opcode to 8 bit AND copy the register constraints */
1432 			if (get_mode_size_bits(conv_mode) == 8) {
1433 				const arch_register_req_t **reqs = arch_get_irn_register_reqs_in(node);
1434 				set_irn_op(pred, op_ia32_Conv_I2I8Bit);
1435 				arch_set_irn_register_reqs_in(pred, reqs);
1436 			}
1437 		} else {
1438 			/* we don't want to end up with 2 loads, so we better do nothing */
1439 			if (get_irn_mode(pred) == mode_T) {
1440 				return;
1441 			}
1442 
1443 			result_conv = exact_copy(pred);
1444 			set_ia32_ls_mode(result_conv, conv_mode);
1445 
1446 			/* Argh: we must change the opcode to 8 bit AND copy the register constraints */
1447 			if (get_mode_size_bits(conv_mode) == 8) {
1448 				const arch_register_req_t **reqs = arch_get_irn_register_reqs_in(node);
1449 				set_irn_op(result_conv, op_ia32_Conv_I2I8Bit);
1450 				arch_set_irn_register_reqs_in(result_conv, reqs);
1451 			}
1452 		}
1453 	} else {
1454 		/* if both convs have the same sign, then we can take the smaller one */
1455 		if (get_mode_sign(conv_mode) == get_mode_sign(pred_mode)) {
1456 			result_conv = pred_proj;
1457 		} else {
1458 			/* no optimisation possible if smaller conv is sign-extend */
1459 			if (mode_is_signed(pred_mode)) {
1460 				return;
1461 			}
1462 			/* we can take the smaller conv if it is unsigned */
1463 			result_conv = pred_proj;
1464 		}
1465 	}
1466 
1467 	ir_fprintf(stderr, "Optimisation warning: unoptimized ia32 Conv(Conv) (%+F, %+F)\n", node, pred);
1468 	/* Some users (like Phis) won't be happy if we change the mode. */
1469 	set_irn_mode(result_conv, get_irn_mode(node));
1470 
1471 	/* kill the conv */
1472 	exchange(node, result_conv);
1473 
1474 	if (get_irn_n_edges(pred_proj) == 0) {
1475 		kill_node(pred_proj);
1476 		if (pred != pred_proj)
1477 			kill_node(pred);
1478 	}
1479 	optimize_conv_conv(result_conv);
1480 }
1481 
1482 static void optimize_node(ir_node *node, void *env)
1483 {
1484 	(void) env;
1485 
1486 	optimize_load_conv(node);
1487 	optimize_conv_store(node);
1488 	optimize_conv_conv(node);
1489 }
1490 
1491 /**
1492  * Performs conv and address mode optimization.
1493  */
1494 void ia32_optimize_graph(ir_graph *irg)
1495 {
1496 	irg_walk_blkwise_graph(irg, NULL, optimize_node, NULL);
1497 }
1498 
1499 void ia32_init_optimize(void)
1500 {
1501 	FIRM_DBG_REGISTER(dbg, "firm.be.ia32.optimize");
1502 }
1503