1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Yabause - sh2_dynarec.c                                               *
3  *   Copyright (C) 2009-2011 Ari64                                         *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
19  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
20 
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <stdint.h> //include for uint64_t
24 #include <assert.h>
25 #include <string.h> //include for memset
26 
27 #include <sys/mman.h>
28 
29 #include "../memory.h"
30 #include "../sh2core.h"
31 #include "../yabause.h"
32 #include "sh2_dynarec.h"
33 
34 #ifdef __i386__
35 #include "assem_x86.h"
36 #endif
37 #ifdef __x86_64__
38 #include "assem_x64.h"
39 #endif
40 #ifdef __arm__
41 #include "assem_arm.h"
42 #endif
43 
44 #define MAXBLOCK 4096
45 #define MAX_OUTPUT_BLOCK_SIZE 262144
46 #define CLOCK_DIVIDER 1
47 #define SH2_REGS 23
48 
// Register allocation state tracked for one instruction.
// NOTE(review): the field notes below only describe how the helpers visible
// in this file touch each field; confirm the wider meaning against the
// allocator before relying on them.
struct regstat
{
  signed char regmap_entry[HOST_REGS]; // searched by loop_reg() (via get_reg) for a branch target
  signed char regmap[HOST_REGS];       // per host register: mapped register number, negative = free (see count_free_regs)
  u32 wasdirty;
  u32 dirty;                           // bit hr is set by dirty_reg() and cleared by alloc_all()
  u64 u;
  u32 wasdoingcp;
  u32 isdoingcp;                       // bit hr set by set_const(), cleared by clear_const()
  u32 cpmap[HOST_REGS];                // constant stored per host register by set_const()
  u32 isconst;
  u32 constmap[SH2_REGS];
};
62 
// Singly linked list node pairing a virtual address with a pointer.
struct ll_entry
{
  u32 vaddr;             // virtual address used as the lookup key
  u32 reg32;             // initialised to 0 by ll_add(); no other use is visible in this part of the file
  void *addr;            // pointer stored with vaddr (get_addr() returns it on a match)
  struct ll_entry *next; // following node, NULL at the end of the list
};
70 
  // File-scope state of the recompiler.  The per-instruction arrays are
  // indexed by instruction number within the block being compiled (0..slen-1).
  // NOTE(review): comments below describe only the uses visible in this part
  // of the file; confirm details against the rest of the recompiler.
  u32 start;          // compared against branch targets (ba[]) to tell whether a branch stays inside the block
  u16 *source;
  void *alignedsource;
  u32 pagelimit;
  char insn[MAXBLOCK][10];
  unsigned char itype[MAXBLOCK];    // instruction type, one of the "instruction types" constants
  unsigned char opcode[MAXBLOCK];
  unsigned char opcode2[MAXBLOCK];
  unsigned char opcode3[MAXBLOCK];
  unsigned char addrmode[MAXBLOCK];
  unsigned char bt[MAXBLOCK];
  signed char rs1[MAXBLOCK];        // source registers, negative when absent
  signed char rs2[MAXBLOCK];
  signed char rs3[MAXBLOCK];
  signed char rt1[MAXBLOCK];        // target registers, negative when absent
  signed char rt2[MAXBLOCK];
  unsigned char us1[MAXBLOCK];
  unsigned char us2[MAXBLOCK];
  unsigned char dep1[MAXBLOCK];
  unsigned char dep2[MAXBLOCK];
  signed char lt1[MAXBLOCK];
  int imm[MAXBLOCK];
  u32 ba[MAXBLOCK];                 // branch target address per instruction
  char is_ds[MAXBLOCK];
  char ooo[MAXBLOCK];
  u64 unneeded_reg[MAXBLOCK];       // bit r is tested by needed_again()/loop_reg()
  u64 branch_unneeded_reg[MAXBLOCK];
  signed char regmap_pre[MAXBLOCK][HOST_REGS];
  u32 cpmap[MAXBLOCK][HOST_REGS];
  struct regstat regs[MAXBLOCK];
  struct regstat branch_regs[MAXBLOCK];
  signed char minimum_free_regs[MAXBLOCK];
  u32 needed_reg[MAXBLOCK];
  u32 wont_dirty[MAXBLOCK];
  u32 will_dirty[MAXBLOCK];
  int cycles[MAXBLOCK];
  int ccadj[MAXBLOCK];
  int slen;                         // number of instructions in the current block
  pointer instr_addr[MAXBLOCK];
  u32 link_addr[MAXBLOCK][3];
  int linkcount;
  u32 stubs[MAXBLOCK*3][8];
  int stubcount;
  pointer ccstub_return[MAXBLOCK];
  u32 literals[1024][2];
  int literalcount;
  int is_delayslot;
  u8 *out;                          // used as the reference point in the "about to expire" distance tests
  struct ll_entry *jump_in[2048];   // per page: entries searched first by get_addr()/check_addr()
  struct ll_entry *jump_out[2048];  // per page: pointers killed by invalidate_page()
  struct ll_entry *jump_dirty[2048];// per page: entries get_addr() revalidates with verify_dirty()
  ALIGNED(16) u32 hash_table[65536][4]; // each bin holds two (vaddr, address) pairs
  ALIGNED(16) char shadow[2097152];
  char *copy;
  int expirep;
  unsigned int stop_after_jal;
  //char invalid_code[0x100000];
  char cached_code[0x20000];        // one bit per 4K block (indexed by block>>3, bit block&7)
  char cached_code_words[2048*128]; // one bit per 32-bit word (indexed by address>>5, bit (address>>2)&7)
  u32 recent_writes[8];             // ring buffer filled by invalidate_addr()
  u32 recent_write_index=0;
  unsigned int slave;
  u32 invalidate_count;
  extern int master_reg[22];
  extern int master_cc;
  extern int master_pc; // Virtual PC
  extern void * master_ip; // Translated PC
  extern int slave_reg[22];
  extern int slave_cc;
  extern int slave_pc; // Virtual PC
  extern void * slave_ip; // Translated PC
  extern u8 restore_candidate[512];
143 
  /* registers that may be allocated */
  /* 0-15 gpr */
#define SR   16 // Status register, including T bit
#define GBR  17 // Global base register
#define VBR  18 // Vector base register
#define MACH 19 // MACH
#define MACL 20 // MACL
#define PR   21 // Return address
#define TBIT 22 // T bit, separate from SR

  /* Numbers above the SH2 registers (SH2_REGS is 23) */
#define CCREG 23 // Cycle count
#define MMREG 24 // Pointer to memory_map
#define TEMPREG 25
#define PTEMP 25 // Prefetch temporary register (same number as TEMPREG)
#define MOREG 26 // offset from memory_map
#define RHASH 27 // Return address hash
#define RHTBL 28 // Return address hash table address
#define RTEMP 29 // BRAF/BSRF address register
#define MAXREG 29
#define AGEN1 30 // Address generation temporary register
#define AGEN2 31 // Address generation temporary register
#define MGEN1 32 // Maptable address generation temporary register
#define MGEN2 33 // Maptable address generation temporary register
167 
  /* instruction types (values stored in itype[]) */
#define NOP 0     // No operation
#define LOAD 1    // Load
#define STORE 2   // Store
#define RMW 3     // Read-Modify-Write
#define PCREL 4   // PC-relative Load
#define MOV 5     // Move
#define ALU 6     // Arithmetic/logic
#define MULTDIV 7 // Multiply/divide
#define SHIFTIMM 8// Shift by immediate
#define IMM8 9    // 8-bit immediate
#define EXT 10    // Sign/Zero Extension
#define FLAGS 11  // SETT/CLRT/MOVT
#define UJUMP 12  // Unconditional jump
#define RJUMP 13  // Unconditional jump to register
#define CJUMP 14  // Conditional branch (BT/BF)
#define SJUMP 15  // Conditional branch with delay slot
#define COMPLEX 16// Complex instructions (function call)
#define SYSTEM 17 // Halt/Trap/Exception
#define SYSCALL 18// SYSCALL (TRAPA)
#define NI 19     // Not implemented
#define DATA 20   // Constant pool data not decoded as instructions
#define BIOS 21   // Emulate BIOS function
191 
  /* addressing modes (presumably the values stored in addrmode[] -- confirm) */
#define REGIND 1  // @Rn
#define POSTINC 2 // @Rn+
#define PREDEC 3  // @-Rm
#define DUALIND 4 // @(R0,Rn)
#define GBRIND 5  // @(R0,GBR)
#define GBRDISP 6 // @(disp,GBR)
#define REGDISP 7 // @(disp,Rn)
200 
  /* stubs: kind codes for out-of-line stub code */
  /* NOTE(review): presumably stored in stubs[][] by the assembler backends -- confirm */
#define CC_STUB 1
#define FP_STUB 2
#define LOADB_STUB 3
#define LOADW_STUB 4
#define LOADL_STUB 5
#define LOADS_STUB 6
#define STOREB_STUB 7
#define STOREW_STUB 8
#define STOREL_STUB 9
#define RMWT_STUB 10
#define RMWA_STUB 11
#define RMWX_STUB 12
#define RMWO_STUB 13
215 
  /* branch codes */
  /* NOTE(review): not used in the visible part of this file; meaning taken from the names only */
#define TAKEN 1
#define NOTTAKEN 2
#define NODS 3
220 
// asm linkage
// Functions shared with the assembly/linkage code.  Those declared with
// empty parentheses have no definition in the visible part of this file.
int sh2_recompile_block(int addr);
void *get_addr_ht(u32 vaddr);
void get_bounds(pointer addr,u32 *start,u32 *end);
void invalidate_addr(u32 addr);
void remove_hash(int vaddr);
void dyna_linker();
void verify_code();
void cc_interrupt();
void cc_interrupt_master();
void slave_entry();
void div1();
void macl();
void macw();
void master_handle_bios();
void slave_handle_bios();

// Needed by assembler
// Declared here so the architecture-specific assem_*.c file, which is
// #included further down, can call them before their definitions appear.
void wb_register(signed char r,signed char regmap[],u32 dirty);
void wb_dirtys(signed char i_regmap[],u32 i_dirty);
void wb_needed_dirtys(signed char i_regmap[],u32 i_dirty,int addr);
void load_regs(signed char entry[],signed char regmap[],int rs1,int rs2,int rs3);
void load_all_regs(signed char i_regmap[]);
void load_needed_regs(signed char i_regmap[],signed char next_regmap[]);
void load_regs_entry(int t);
void load_all_consts(signed char regmap[],u32 dirty,int i);
247 
// Debug switch, off by default.
// NOTE(review): nothing in the visible part of this file reads it.
int tracedebug=0;

//#define DEBUG_CYCLE_COUNT 1
251 
// printf-shaped sink: accepts a format string plus any arguments and does
// nothing.  assem_debug/inv_debug expand to this when tracing is disabled.
void nullf(const char *format, ...)
{
  (void)format;
}
// Debug output hooks.  Swap in the printf variants below to trace code
// generation (assem_debug) or invalidation (inv_debug); the default routes
// both to nullf, which discards its arguments.
//#define assem_debug printf
//#define inv_debug printf
#define assem_debug nullf
#define inv_debug nullf
257 
258 
// Get address from virtual address
// This is called from the recompiled BRAF/BSRF instructions
//
// Lookup order:
//   1. jump_in[page]    - on a match, the pair is pushed to the front of the
//                         hash bin and the stored pointer is returned.
//   2. jump_dirty[page] - a match is used only if it is not about to expire
//                         from the cache and verify_dirty() accepts it; the
//                         page is then marked cached/write-protected again.
//   3. Otherwise the block is recompiled and the lookup is retried.
// NOTE(review): the return value of sh2_recompile_block() is ignored and the
// retry is a recursive call, so this does not terminate if recompilation
// never produces an entry for vaddr.
void *get_addr(u32 vaddr)
{
  struct ll_entry *head;
  // Page index ignores address bit 29; pages above 1024 fold into 1024..2047
  u32 page=(vaddr&0xDFFFFFFF)>>12;
  if(page>1024) page=1024+(page&1023);
  //printf("TRACE: count=%d next=%d (get_addr %x,page %d)\n",Count,next_interupt,vaddr,page);
  head=jump_in[page];
  while(head!=NULL) {
  //printf("TRACE: (get_addr check %x: %x)\n",vaddr,(int)head->addr);
    if(head->vaddr==vaddr) {
  //printf("TRACE: count=%d next=%d (get_addr match %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
  //printf("TRACE: (get_addr match %x: %x)\n",vaddr,(int)head->addr);
      // Demote the bin's first pair to the second slot and insert this one first
      u32 *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
      ht_bin[3]=ht_bin[1];
      ht_bin[2]=ht_bin[0];
      ht_bin[1]=(int)head->addr;
      ht_bin[0]=vaddr;
      //printf("TRACE: get_addr clean (%x,%x)\n",vaddr,(int)head->addr);
      return head->addr;
    }
    head=head->next;
  }
  head=jump_dirty[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      //printf("TRACE: count=%d next=%d (get_addr match dirty %x: %x)\n",Count,next_interupt,vaddr,(int)head->addr);
      // Don't restore blocks which are about to expire from the cache
      // (the next two ifs are nested: both must hold to restore the block)
      if((((u32)head->addr-(u32)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(verify_dirty((pointer)head->addr)) {
        u32 start,end;
        u32 *ht_bin;
        //printf("restore candidate: %x (%d) d=%d\n",vaddr,page,(cached_code[vaddr>>15]>>((vaddr>>12)&7))&1);
        //invalid_code[vaddr>>12]=0;
        // Flag the 4K block as containing code, for vaddr and for the same
        // address with bit 29 flipped
        cached_code[vaddr>>15]|=1<<((vaddr>>12)&7);
        cached_code[(vaddr^0x20000000)>>15]|=1<<((vaddr>>12)&7);
        // Set the flag bit in the memory_map entries for both addresses
        #ifdef POINTERS_64BIT
        memory_map[vaddr>>12]|=0x4000000000000000LL;
        memory_map[(vaddr^0x20000000)>>12]|=0x4000000000000000LL;
        #else
        memory_map[vaddr>>12]|=0x40000000;
        memory_map[(vaddr^0x20000000)>>12]|=0x40000000;
        #endif
        restore_candidate[page>>3]|=1<<(page&7);
        // start/end are addresses bounding the block's source; they are
        // compared against HighWram/LowWram below
        get_bounds((pointer)head->addr,&start,&end);
        if(start-(u32)HighWram<0x100000) {
          u32 vstart=start-(u32)HighWram+0x6000000;
          u32 vend=end-(u32)HighWram+0x6000000;
          int i;
          //printf("write protect: start=%x, end=%x\n",vstart,vend);
          // Set one bit per 32-bit word covered by the block
          for(i=0;i<vend-vstart;i+=4) {
            cached_code_words[((vstart<4194304?vstart:((vstart|0x400000)&0x7fffff))+i)>>5]|=1<<(((vstart+i)>>2)&7);
          }
        }
        if(start-(u32)LowWram<0x100000) {
          u32 vstart=start-(u32)LowWram+0x200000;
          u32 vend=end-(u32)LowWram+0x200000;
          int i;
          //printf("write protect: start=%x, end=%x\n",vstart,vend);
          // Set one bit per 32-bit word covered by the block
          for(i=0;i<vend-vstart;i+=4) {
            cached_code_words[((vstart<4194304?vstart:((vstart|0x400000)&0x7fffff))+i)>>5]|=1<<(((vstart+i)>>2)&7);
          }
        }
        ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
        if(ht_bin[0]==vaddr) {
          ht_bin[1]=(int)head->addr; // Replace existing entry
        }
        else
        {
          ht_bin[3]=ht_bin[1];
          ht_bin[2]=ht_bin[0];
          ht_bin[1]=(int)head->addr;
          ht_bin[0]=vaddr;
        }
        //printf("TRACE: get_addr dirty (%x,%x)\n",vaddr,(int)head->addr);
        return head->addr;
      }
    }
    head=head->next;
  }
  // Nothing usable is cached: compile the block, then look it up again
  sh2_recompile_block(vaddr);
  return get_addr(vaddr);
}
343 // Look up address in hash table first
get_addr_ht(u32 vaddr)344 void *get_addr_ht(u32 vaddr)
345 {
346   //printf("TRACE: count=%d next=%d (get_addr_ht %x)\n",Count,next_interupt,vaddr);
347   //if(vaddr>>12==0x60a0) printf("TRACE: (get_addr_ht %x)\n",vaddr);
348   u32 *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
349   //if(vaddr>>12==0x60a0) printf("%x %x %x %x\n",ht_bin[0],ht_bin[1],ht_bin[2],ht_bin[3]);
350   if(ht_bin[0]==vaddr) return (void *)ht_bin[1];
351   if(ht_bin[2]==vaddr) return (void *)ht_bin[3];
352   return get_addr(vaddr);
353 }
354 
clear_all_regs(signed char regmap[])355 void clear_all_regs(signed char regmap[])
356 {
357   int hr;
358   for (hr=0;hr<HOST_REGS;hr++) regmap[hr]=-1;
359 }
360 
get_reg(signed char regmap[],int r)361 signed char get_reg(signed char regmap[],int r)
362 {
363   int hr;
364   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
365   return -1;
366 }
367 
368 // Get a second temporary register (hopefully different from the first)
get_alt_reg(signed char regmap[],int r)369 signed char get_alt_reg(signed char regmap[],int r)
370 {
371   int hr;
372   for (hr=HOST_REGS-1;hr>=0;hr--) if(hr!=EXCLUDE_REG&&regmap[hr]==r) return hr;
373   return -1;
374 }
375 
376 // Find a register that is available for two consecutive cycles
get_reg2(signed char regmap1[],signed char regmap2[],int r)377 signed char get_reg2(signed char regmap1[],signed char regmap2[],int r)
378 {
379   int hr;
380   for (hr=0;hr<HOST_REGS;hr++) if(hr!=EXCLUDE_REG&&regmap1[hr]==r&&regmap2[hr]==r) return hr;
381   return -1;
382 }
383 
count_free_regs(signed char regmap[])384 int count_free_regs(signed char regmap[])
385 {
386   int count=0;
387   int hr;
388   for(hr=0;hr<HOST_REGS;hr++)
389   {
390     if(hr!=EXCLUDE_REG) {
391       if(regmap[hr]<0) count++;
392     }
393   }
394   return count;
395 }
396 
dirty_reg(struct regstat * cur,signed char reg)397 void dirty_reg(struct regstat *cur,signed char reg)
398 {
399   int hr;
400   if(reg<0) return;
401   for (hr=0;hr<HOST_REGS;hr++) {
402     if((cur->regmap[hr]&63)==reg) {
403       cur->dirty|=1<<hr;
404     }
405   }
406 }
407 
// Record a known constant for reg in every host register that maps it.
// A host register mapped to reg itself receives the value (truncated to the
// width of cpmap[]); one mapped to reg with bit 6 toggled receives value>>32.
// Negative reg is ignored.
void set_const(struct regstat *cur,signed char reg,u64 value)
{
  int host;
  if(reg<0) return;
  for(host=0;host<HOST_REGS;host++) {
    const signed char mapped=cur->regmap[host];
    u32 part;
    if(mapped==reg) {
      part=value;
    } else if((mapped^64)==reg) {
      part=value>>32;
    } else {
      continue;
    }
    cur->isdoingcp|=1<<host;
    cur->cpmap[host]=part;
  }
}
423 
clear_const(struct regstat * cur,signed char reg)424 void clear_const(struct regstat *cur,signed char reg)
425 {
426   int hr;
427   if(reg<0) return;
428   for (hr=0;hr<HOST_REGS;hr++) {
429     if((cur->regmap[hr]&63)==reg) {
430       cur->isdoingcp&=~(1<<hr);
431     }
432   }
433 }
434 
is_const(struct regstat * cur,signed char reg)435 int is_const(struct regstat *cur,signed char reg)
436 {
437   int hr;
438   if(reg<0) return 0;
439   for (hr=0;hr<HOST_REGS;hr++) {
440     if((cur->regmap[hr]&63)==reg) {
441       return (cur->isdoingcp>>hr)&1;
442     }
443   }
444   return 0;
445 }
get_const(struct regstat * cur,signed char reg)446 u64 get_const(struct regstat *cur,signed char reg)
447 {
448   int hr;
449   if(reg<0) return 0;
450   for (hr=0;hr<HOST_REGS;hr++) {
451     if(cur->regmap[hr]==reg) {
452       return cur->cpmap[hr];
453     }
454   }
455   printf("Unknown constant in r%d\n",reg);
456   exit(1);
457 }
458 
// Record that SH2 register reg holds the constant value.
//   isconst   bitmask of registers with a known constant; bit reg is set
//   constmap  per-register constant table; constmap[reg] receives value
//             (truncated to the element width)
//   reg       register number; negative means "no register" and is ignored
//   value     constant to record
void sh2_set_const(u32 *isconst,u32 *constmap,signed char reg,u64 value)
{
  // Same guard as sh2_clear_const(): a negative reg would make the shift
  // undefined and constmap[reg] an out-of-bounds write.
  if(reg<0) return;
  *isconst|=1u<<reg;
  constmap[reg]=value;
}
464 
// Clear bit reg in the isconst bitmask.
// constmap is accepted for symmetry with sh2_set_const() but is not touched.
// Negative reg is ignored.
void sh2_clear_const(u32 *isconst,u32 *constmap,signed char reg)
{
  (void)constmap;
  if(reg>=0) {
    *isconst&=~(1<<reg);
  }
}
470 
471 
472 // Least soon needed registers
473 // Look at the next ten instructions and see which registers
474 // will be used.  Try not to reallocate these.
lsn(unsigned char hsn[],int i,int * preferred_reg)475 void lsn(unsigned char hsn[], int i, int *preferred_reg)
476 {
477   int j;
478   int b=-1;
479   for(j=0;j<9;j++)
480   {
481     if(i+j>=slen) {
482       j=slen-i-1;
483       break;
484     }
485     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP)
486     {
487       // Don't go past an unconditonal jump
488       j++;
489       break;
490     }
491   }
492   for(;j>=0;j--)
493   {
494     if(rs1[i+j]>=0) hsn[rs1[i+j]]=j;
495     if(rs2[i+j]>=0) hsn[rs2[i+j]]=j;
496     if(rs3[i+j]>=0) hsn[rs3[i+j]]=j;
497     if(rt1[i+j]>=0) hsn[rt1[i+j]]=j;
498     if(rt2[i+j]>=0) hsn[rt2[i+j]]=j;
499     if(rs1[i+j]==TBIT) hsn[SR]=j;
500     if(rs2[i+j]==TBIT) hsn[SR]=j;
501     if(rs3[i+j]==TBIT) hsn[SR]=j;
502     if(rt1[i+j]==TBIT) hsn[SR]=j;
503     if(rt2[i+j]==TBIT) hsn[SR]=j;
504     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
505     {
506       hsn[CCREG]=j;
507       b=j;
508     }
509   }
510   if(b>=0)
511   {
512     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
513     {
514       // Follow first branch
515       int t=(ba[i+b]-start)>>2;
516       j=7-b;if(t+j>=slen) j=slen-t-1;
517       for(;j>=0;j--)
518       {
519         if(rs1[t+j]>=0) if(hsn[rs1[t+j]]>j+b+2) hsn[rs1[t+j]]=j+b+2;
520         if(rs2[t+j]>=0) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
521         if(rs3[t+j]>=0) if(hsn[rs2[t+j]]>j+b+2) hsn[rs2[t+j]]=j+b+2;
522         //if(rt1[t+j]) if(hsn[rt1[t+j]]>j+b+2) hsn[rt1[t+j]]=j+b+2;
523         //if(rt2[t+j]) if(hsn[rt2[t+j]]>j+b+2) hsn[rt2[t+j]]=j+b+2;
524       }
525     }
526     // TODO: preferred register based on backward branch
527   }
528   // Delay slot should preferably not overwrite branch conditions or cycle count
529   if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP||itype[i-1]==SJUMP)) {
530     if(rs1[i-1]>=0) if(hsn[rs1[i-1]]>1) hsn[rs1[i-1]]=1;
531     if(rs2[i-1]>=0) if(hsn[rs2[i-1]]>1) hsn[rs2[i-1]]=1;
532     if(rs3[i-1]>=0) if(hsn[rs3[i-1]]>1) hsn[rs3[i-1]]=1;
533     if(itype[i-1]==SJUMP) if(hsn[SR]>1) hsn[SR]=1;
534     hsn[CCREG]=1;
535     // ...or hash tables
536     hsn[RHASH]=1;
537     hsn[RHTBL]=1;
538     // .. or branch target
539     hsn[RTEMP]=1;
540   }
541   // If reading/writing T bit, need SR
542   if(rs1[i]==TBIT||rs2[i]==TBIT||rt1[i]==TBIT||rt2[i]==TBIT) {
543     hsn[SR]=0;
544   }
545   // Don't remove the memory_map registers either
546   if(itype[i]==LOAD || itype[i]==STORE || itype[i]==RMW || itype[i]==PCREL) {
547     hsn[MOREG]=0;
548   }
549   if(itype[i]==UJUMP || itype[i]==RJUMP || itype[i]==SJUMP)
550   {
551     if(itype[i+1]==LOAD || itype[i+1]==STORE || itype[i+1]==RMW || itype[i+1]==PCREL) {
552       hsn[MOREG]=0;
553     }
554   }
555   if(itype[i]==SYSTEM && opcode[i]==12) { // TRAPA
556     hsn[MOREG]=0;
557   }
558   // Don't remove the miniht registers
559   if(itype[i]==UJUMP||itype[i]==RJUMP)
560   {
561     hsn[RHASH]=0;
562     hsn[RHTBL]=0;
563     // or branch target
564     hsn[RTEMP]=0;
565   }
566 }
567 
568 // We only want to allocate registers if we're going to use them again soon
needed_again(int r,int i)569 int needed_again(int r, int i)
570 {
571   int j;
572   int b=-1;
573   int rn=10;
574 
575   if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP))
576   {
577     if(ba[i-1]<start || ba[i-1]>start+slen*4-4)
578       return 0; // Don't need any registers if exiting the block
579   }
580   for(j=0;j<9;j++)
581   {
582     if(i+j>=slen) {
583       j=slen-i-1;
584       break;
585     }
586     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP)
587     {
588       // Don't go past an unconditonal jump
589       j++;
590       break;
591     }
592     if(itype[i+j]==SYSCALL||itype[i+j]==SYSTEM)
593     {
594       break;
595     }
596   }
597   for(;j>=1;j--)
598   {
599     if(rs1[i+j]==r) rn=j;
600     if(rs2[i+j]==r) rn=j;
601     if((unneeded_reg[i+j]>>r)&1) rn=10;
602     if(i+j>=0&&(itype[i+j]==UJUMP||itype[i+j]==CJUMP||itype[i+j]==SJUMP))
603     {
604       b=j;
605     }
606   }
607   /*
608   if(b>=0)
609   {
610     if(ba[i+b]>=start && ba[i+b]<(start+slen*4))
611     {
612       // Follow first branch
613       int o=rn;
614       int t=(ba[i+b]-start)>>2;
615       j=7-b;if(t+j>=slen) j=slen-t-1;
616       for(;j>=0;j--)
617       {
618         if(!((unneeded_reg[t+j]>>r)&1)) {
619           if(rs1[t+j]==r) if(rn>j+b+2) rn=j+b+2;
620           if(rs2[t+j]==r) if(rn>j+b+2) rn=j+b+2;
621         }
622         else rn=o;
623       }
624     }
625   }*/
626   if(rn<10) return 1;
627   return 0;
628 }
629 
630 // Try to match register allocations at the end of a loop with those
631 // at the beginning
loop_reg(int i,int r,int hr)632 int loop_reg(int i, int r, int hr)
633 {
634   int j,k;
635   for(j=0;j<9;j++)
636   {
637     if(i+j>=slen) {
638       j=slen-i-1;
639       break;
640     }
641     if(itype[i+j]==UJUMP||itype[i+j]==RJUMP)
642     {
643       // Don't go past an unconditonal jump
644       j++;
645       break;
646     }
647   }
648   k=0;
649   if(i>0){
650     if(itype[i-1]==UJUMP||itype[i-1]==CJUMP||itype[i-1]==SJUMP)
651       k--;
652   }
653   for(;k<j;k++)
654   {
655     if(r<64&&((unneeded_reg[i+k]>>r)&1)) return hr;
656     if(i+k>=0&&(itype[i+k]==UJUMP||itype[i+k]==CJUMP||itype[i+k]==SJUMP))
657     {
658       if(ba[i+k]>=start && ba[i+k]<(start+i*2))
659       {
660         int t=(ba[i+k]-start)>>1;
661         int reg=get_reg(regs[t].regmap_entry,r);
662         if(reg>=0) return reg;
663         //reg=get_reg(regs[t+1].regmap_entry,r);
664         //if(reg>=0) return reg;
665       }
666     }
667   }
668   return hr;
669 }
670 
671 
672 // Allocate every register, preserving source/target regs
alloc_all(struct regstat * cur,int i)673 void alloc_all(struct regstat *cur,int i)
674 {
675   int hr;
676 
677   for(hr=0;hr<HOST_REGS;hr++) {
678     if(hr!=EXCLUDE_REG) {
679       if(((cur->regmap[hr]&63)!=rs1[i])&&((cur->regmap[hr]&63)!=rs2[i])&&((cur->regmap[hr]&63)!=rs3[i])&&
680          ((cur->regmap[hr]&63)!=rt1[i])&&((cur->regmap[hr]&63)!=rt2[i]))
681       {
682         cur->regmap[hr]=-1;
683         cur->dirty&=~(1<<hr);
684       }
685     }
686   }
687 }
688 
// Return 1 if address lies in a region that map_address() can translate for
// reading, 0 otherwise.  Address bit 29 is ignored.  Accepted regions:
//   0x00200000-0x002FFFFF, 0x06000000-0x07FFFFFF and 0x00000000-0x000FFFFF.
int can_direct_read(int address)
{
  const unsigned int region=address&0xDFF00000;
  return region==0x200000 ||
         (address&0xDE000000)==0x6000000 ||
         region==0;
}
696 
// Return 1 if address lies in a region that may be written directly,
// 0 otherwise.  Address bit 29 is ignored.  Accepted regions:
//   0x00200000-0x002FFFFF and 0x06000000-0x07FFFFFF.
int can_direct_write(int address)
{
  if((address&0xDFF00000)!=0x200000 && (address&0xDE000000)!=0x6000000) {
    return 0;
  }
  return 1;
}
703 
map_address(u32 address)704 static pointer map_address(u32 address)
705 {
706   if((address&0xDFF00000)==0x200000) return (pointer)LowWram+(address&0xFFFFF);
707   if((address&0xDE000000)==0x6000000) return (pointer)HighWram+(address&0xFFFFF);
708   assert((address&0xDFF00000)==0);
709   return (pointer)BiosRom+(address&0x8FFFF);
710 }
711 
712 #ifdef __i386__
713 #include "assem_x86.c"
714 #endif
715 #ifdef __x86_64__
716 #include "assem_x64.c"
717 #endif
718 #ifdef __arm__
719 #include "assem_arm.c"
720 #endif
721 
722 // Add virtual address mapping to linked list
ll_add(struct ll_entry ** head,int vaddr,void * addr)723 void ll_add(struct ll_entry **head,int vaddr,void *addr)
724 {
725   struct ll_entry *new_entry;
726   new_entry=malloc(sizeof(struct ll_entry));
727   assert(new_entry!=NULL);
728   new_entry->vaddr=vaddr;
729   new_entry->reg32=0;
730   new_entry->addr=addr;
731   new_entry->next=*head;
732   *head=new_entry;
733 }
734 
735 // Add to linked list only if there is not an existing record
ll_add_nodup(struct ll_entry ** head,int vaddr,void * addr)736 void ll_add_nodup(struct ll_entry **head,int vaddr,void *addr)
737 {
738   struct ll_entry *ptr;
739   ptr=*head;
740   while(ptr!=NULL) {
741     if(ptr->vaddr==vaddr) {
742       return;
743     }
744     ptr=ptr->next;
745   }
746   ll_add(head,vaddr,addr);
747 }
748 
// Check if an address is already compiled
// but don't return addresses which are about to expire from the cache
//
// Returns the compiled address for vaddr, or 0 when there is none that is
// safe to use.  Unlike get_addr() this never triggers a recompile.
// The hash bin is consulted first (entries must also pass isclean()), then
// the jump_in list for the page.
void *check_addr(u32 vaddr)
{
  struct ll_entry *head;
  u32 page;
  u32 *ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
  // First pair of the bin: accept only if far enough from the output
  // pointer and still clean
  if(ht_bin[0]==vaddr) {
    if(((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE-(u32)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(isclean(ht_bin[1])) return (void *)ht_bin[1];
  }
  // Second pair of the bin, same conditions
  if(ht_bin[2]==vaddr) {
    if(((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE-(u32)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2)))
      if(isclean(ht_bin[3])) return (void *)ht_bin[3];
  }
  // Page index ignores address bit 29; pages above 1024 fold into 1024..2047
  page=(vaddr&0xDFFFFFFF)>>12;
  if(page>1024) page=1024+(page&1023);
  head=jump_in[page];
  while(head!=NULL) {
    if(head->vaddr==vaddr) {
      // Skip entries that are about to expire from the cache
      if((((u32)head->addr-(u32)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
        // Update existing entry with current address
        if(ht_bin[0]==vaddr) {
          ht_bin[1]=(int)head->addr;
          return head->addr;
        }
        if(ht_bin[2]==vaddr) {
          ht_bin[3]=(int)head->addr;
          return head->addr;
        }
        // Insert into hash table with low priority.
        // Don't evict existing entries, as they are probably
        // addresses that are being accessed frequently.
        // (-1 marks an empty slot; see remove_hash)
        if(ht_bin[0]==-1) {
          ht_bin[1]=(int)head->addr;
          ht_bin[0]=vaddr;
        }else if(ht_bin[2]==-1) {
          ht_bin[3]=(int)head->addr;
          ht_bin[2]=vaddr;
        }
        return head->addr;
      }
    }
    head=head->next;
  }
  return 0;
}
796 
remove_hash(int vaddr)797 void remove_hash(int vaddr)
798 {
799   //printf("remove hash: %x\n",vaddr);
800   u32 *ht_bin=hash_table[(((vaddr)>>16)^vaddr)&0xFFFF];
801   if(ht_bin[2]==vaddr) {
802     ht_bin[2]=ht_bin[3]=-1;
803   }
804   if(ht_bin[0]==vaddr) {
805     ht_bin[0]=ht_bin[2];
806     ht_bin[1]=ht_bin[3];
807     ht_bin[2]=ht_bin[3]=-1;
808   }
809 }
810 
ll_remove_matching_addrs(struct ll_entry ** head,int addr,int shift)811 void ll_remove_matching_addrs(struct ll_entry **head,int addr,int shift)
812 {
813   struct ll_entry *next;
814   while(*head) {
815     if(((u32)((*head)->addr)>>shift)==(addr>>shift) ||
816        ((u32)(((char *)(*head)->addr)-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift))
817     {
818       inv_debug("EXP: Remove pointer to %x (%x)\n",(int)(*head)->addr,(*head)->vaddr);
819       remove_hash((*head)->vaddr);
820       next=(*head)->next;
821       free(*head);
822       *head=next;
823     }
824     else
825     {
826       head=&((*head)->next);
827     }
828   }
829 }
830 
831 // Remove all entries from linked list
ll_clear(struct ll_entry ** head)832 void ll_clear(struct ll_entry **head)
833 {
834   struct ll_entry *cur;
835   struct ll_entry *next;
836   if((cur=*head)) {
837     *head=0;
838     while(cur) {
839       next=cur->next;
840       free(cur);
841       cur=next;
842     }
843   }
844 }
845 
846 // Dereference the pointers and remove if it matches
ll_kill_pointers(struct ll_entry * head,int addr,int shift)847 void ll_kill_pointers(struct ll_entry *head,int addr,int shift)
848 {
849   while(head) {
850     int ptr=get_pointer(head->addr);
851     inv_debug("EXP: Lookup pointer to %x at %x (%x)\n",(int)ptr,(int)head->addr,head->vaddr);
852     if(((ptr>>shift)==(addr>>shift)) ||
853        (((ptr-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(addr>>shift)))
854     {
855       u32 host_addr;
856       inv_debug("EXP: Kill pointer at %x (%x)\n",(int)head->addr,head->vaddr);
857       host_addr=(u32)kill_pointer(head->addr);
858       #ifdef __arm__
859         needs_clear_cache[(host_addr-(u32)BASE_ADDR)>>17]|=1<<(((host_addr-(u32)BASE_ADDR)>>12)&31);
860       #endif
861     }
862     head=head->next;
863   }
864 }
865 
866 // This is called when we write to a compiled block
invalidate_page(u32 page)867 void invalidate_page(u32 page)
868 {
869   struct ll_entry *head;
870   struct ll_entry *next;
871   head=jump_in[page];
872   jump_in[page]=0;
873   while(head!=NULL) {
874     inv_debug("INVALIDATE: %x\n",head->vaddr);
875     remove_hash(head->vaddr);
876     next=head->next;
877     free(head);
878     head=next;
879   }
880   head=jump_out[page];
881   jump_out[page]=0;
882   while(head!=NULL) {
883     u32 host_addr;
884     inv_debug("INVALIDATE: kill pointer to %x (%x)\n",head->vaddr,(int)head->addr);
885     host_addr=(u32)kill_pointer(head->addr);
886     #ifdef __arm__
887       needs_clear_cache[(host_addr-(u32)BASE_ADDR)>>17]|=1<<(((host_addr-(u32)BASE_ADDR)>>12)&31);
888     #endif
889     next=head->next;
890     free(head);
891     head=next;
892   }
893 }
894 
// Invalidate all compiled code for the 4K blocks firstblock..lastblock
// (block number = address>>12).
// Steps:
//   1. Widen the page range [first,last] to cover dirty blocks whose source
//      extends into neighbouring pages.
//   2. invalidate_page() each page in that range.
//   3. For every requested block, stop trapping writes: clear its
//      cached_code bit, reset its memory_map entries and clear its
//      cached_code_words bits.
void invalidate_blocks(u32 firstblock,u32 lastblock)
{
  u32 page;
  int block;
  u32 first,last;
  // Page indices: blocks >= 1024 fold into 1024..2047
  first=firstblock<1024?firstblock:1024+(firstblock&1023);
  last=lastblock<1024?lastblock:1024+(lastblock&1023);
  // Invalidate the adjacent pages if a block crosses a 4K boundary
  for(block=firstblock;block<=lastblock;block++) {
    struct ll_entry *head;
    page=block&0xDFFFF;
    if(page>1024) page=1024+(page&1023);
    inv_debug("INVALIDATE: %x..%x (%d)\n",firstblock<<12,lastblock<<12,page);
    //inv_debug("invalid_code[block]=%d\n",invalid_code[block]);
    head=jump_dirty[page];
    //printf("page=%d vpage=%d\n",page,vpage);
    while(head!=NULL) {
      u32 start,end;
      if((head->vaddr>>12)==block) { // Ignore vaddr hash collision
        get_bounds((pointer)head->addr,&start,&end);
        //printf("start: %x end: %x\n",start,end);
        // Source lies in low work RAM
        if(start>=(u32)LowWram&&end<(u32)LowWram+1048576) {
          if(((start-(u32)LowWram)>>12)<=page&&((end-1-(u32)LowWram)>>12)>=page) {
            if((((start-(u32)LowWram)>>12)+512)<first) first=((start-(u32)LowWram)>>12)&1023;
            if((((end-1-(u32)LowWram)>>12)+512)>last) last=((end-1-(u32)LowWram)>>12)&1023;
          }
        }
        // FIXME: Aliasing/mirroring is wrong here
        // Source lies in high work RAM
        if(start>=(u32)HighWram&&end<(u32)HighWram+1048576) {
          if(((start-(u32)HighWram)>>12)<=page-1024&&((end-1-(u32)HighWram)>>12)>=page-1024) {
            if((((start-(u32)HighWram)>>12)&255)<first-1024) first=(((start-(u32)HighWram)>>12)&255)+1024;
            if((((end-1-(u32)HighWram)>>12)&255)>last-1024) last=(((end-1-(u32)HighWram)>>12)&255)+1024;
          }
        }
      }
      head=head->next;
    }
  }
  //printf("first=%d last=%d\n",first,last);
  while(first<=last) {
    invalidate_page(first);
    first++;
  }
  #ifdef __arm__
    do_clear_cache();
  #endif

  for(block=firstblock;block<=lastblock;block++) {
    // Don't trap writes
    // (clear the bit for the block and for the same block with bit 17 flipped)
    cached_code[block>>3]&=~(1<<(block&7));
    cached_code[(block^0x20000)>>3]&=~(1<<(block&7));

    // Rewrite the memory_map entries for the block and its bit-17 twin,
    // which also drops the flag bit that get_addr() sets
    #ifdef POINTERS_64BIT
    if((block>=0x0200&&block<0x0300)||(block>=0x20200&&block<0x20300)) {
      memory_map[block]=((u64)LowWram-((block<<12)&0xFFF00000))>>2;
      memory_map[block^0x20000]=((u64)LowWram-(((block^0x20000)<<12)&0xFFF00000))>>2;
    }
    if((block>=0x6000&&block<0x8000)||(block>=0x26000&&block<0x28000)) {
      memory_map[block]=((u64)HighWram-((block<<12)&0xFFF00000))>>2;
      memory_map[block^0x20000]=((u64)HighWram-(((block^0x20000)<<12)&0xFFF00000))>>2;
    }
    #else
    if((block>=0x0200&&block<0x0300)||(block>=0x20200&&block<0x20300)) {
      memory_map[block]=((u32)LowWram-((block<<12)&0xFFF00000))>>2;
      memory_map[block^0x20000]=((u32)LowWram-(((block^0x20000)<<12)&0xFFF00000))>>2;
    }
    if((block>=0x6000&&block<0x8000)||(block>=0x26000&&block<0x28000)) {
      memory_map[block]=((u32)HighWram-((block<<12)&0xFFF00000))>>2;
      memory_map[block^0x20000]=((u32)HighWram-(((block^0x20000)<<12)&0xFFF00000))>>2;
    }
    #endif
    // Clear the per-word bitmap for the page (128 bytes per page)
    page=block&0xDFFFF;
    if(page>1024) page=1024+(page&1023);
    memset(cached_code_words+(page<<7),0,128);
  }
  #ifdef USE_MINI_HT
  // Reset both mini hash tables to all-ones (empty)
  memset(mini_ht_master,-1,sizeof(mini_ht_master));
  memset(mini_ht_slave,-1,sizeof(mini_ht_slave));
  #endif
}
// Invalidate compiled code in response to a write at 'addr'.
//
// cached_code_words is consulted first: each entry covers 32 bytes of
// address space, one bit per 4-byte word.  If the bit for the written
// word is clear, the write is tolerated (up to a limit) and only logged
// in recent_writes when the page is a restore candidate.  Otherwise the
// whole 4K block containing 'addr' is invalidated.
void invalidate_addr(u32 addr)
{
  u32 offset=addr&0xDFFFFFFF;
  u32 slot,bit;
  if(offset>4194304) offset=(addr|0x400000)&0x7fffff;
  slot=offset>>5;
  bit=(offset>>2)&7;

  // If we get an excessive number of writes to unmarked words,
  // then we probably do want to invalidate the page, so the
  // counter is only bumped (and tested) when the bit is clear.
  if(!((cached_code_words[slot]>>bit)&1)&&invalidate_count++<500) {
    if((restore_candidate[offset>>15]>>((offset>>12)&7))&1) {
      recent_writes[recent_write_index]=addr;
      recent_write_index=(recent_write_index+1)&7;
    }
    return;
  }

  invalidate_blocks(addr>>12,addr>>12);
  assert(!((cached_code_words[slot]>>bit)&1));

  // Keep track of recent writes that invalidated the cache, so we don't
  // attempt constant propagation in areas that are frequently written
  recent_writes[recent_write_index]=addr;
  recent_write_index=(recent_write_index+1)&7;
}
1001 // This is called when loading a save state.
1002 // Anything could have changed, so invalidate everything.
invalidate_all_pages()1003 void invalidate_all_pages()
1004 {
1005   u32 page;
1006   for(page=0;page<2048;page++)
1007     invalidate_page(page);
1008   for(page=0;page<256;page++) {
1009     if(cached_code[page]) {
1010       restore_candidate[page]|=cached_code[page]; // LowWram/bios
1011     }
1012     if(cached_code[3072+page]) {
1013       restore_candidate[page+256]|=cached_code[3072+page]; // HighWram
1014     }
1015   }
1016   memset(cached_code_words,0,262144);
1017   #ifdef __arm__
1018   __clear_cache((void *)BASE_ADDR,(void *)BASE_ADDR+(1<<TARGET_SIZE_2));
1019   #endif
1020   #ifdef USE_MINI_HT
1021   memset(mini_ht_master,-1,sizeof(mini_ht_master));
1022   memset(mini_ht_slave,-1,sizeof(mini_ht_slave));
1023   #endif
1024 }
1025 
1026 // Add an entry to jump_out after making a link
add_link(u32 vaddr,void * src)1027 void add_link(u32 vaddr,void *src)
1028 {
1029   u32 page=(vaddr&0xDFFFFFFF)>>12;
1030   if(page>1024) page=1024+(page&1023);
1031   inv_debug("add_link: %x -> %x (%d)\n",(int)src,vaddr,page);
1032   ll_add(jump_out+page,vaddr,src);
1033   //int ptr=get_pointer(src);
1034   //inv_debug("add_link: Pointer is to %x\n",(int)ptr);
1035 }
1036 
1037 // If a code block was found to be unmodified (bit was set in
1038 // restore_candidate) and it remains unmodified (bit is set
1039 // in cached_code) then move the entries for that 4K page from
1040 // the dirty list to the clean list.
clean_blocks(u32 page)1041 void clean_blocks(u32 page)
1042 {
1043   struct ll_entry *head;
1044   inv_debug("INV: clean_blocks page=%d\n",page);
1045   head=jump_dirty[page];
1046   while(head!=NULL) {
1047     if((cached_code[head->vaddr>>15]>>((head->vaddr>>12)&7))&1) {;
1048       // Don't restore blocks which are about to expire from the cache
1049       if((((u32)head->addr-(u32)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1050         u32 start,end,vstart=0,vend;
1051         if(verify_dirty((int)head->addr)) {
1052           //printf("Possibly Restore %x (%x)\n",head->vaddr, (int)head->addr);
1053           u32 i;
1054           u32 inv=0;
1055           get_bounds((pointer)head->addr,&start,&end);
1056           if(start-(u32)HighWram<0x100000) {
1057             vstart=start-(u32)HighWram+0x6000000;
1058             vend=end-(u32)HighWram+0x6000000;
1059             for(i=(start-(u32)HighWram+0x6000000)>>12;i<=(end-1-(u32)HighWram+0x6000000)>>12;i++) {
1060               // Check that all the pages are write-protected
1061               if(!((cached_code[i>>3]>>(i&7))&1)) inv=1;
1062             }
1063           }
1064           if(start-(u32)LowWram<0x100000) {
1065             vstart=start-(u32)LowWram+0x200000;
1066             vend=end-(u32)LowWram+0x200000;
1067             for(i=(start-(u32)LowWram+0x200000)>>12;i<=(end-1-(u32)LowWram+0x200000)>>12;i++) {
1068               // Check that all the pages are write-protected
1069               if(!((cached_code[i>>3]>>(i&7))&1)) inv=1;
1070             }
1071           }
1072           // Don't restore stuff that recently got hit, it will probably get hit again
1073           if(vstart) for(i=0;i<8;i++) {
1074             if(recent_writes[i]>=vstart&&recent_writes[i]<vend) {
1075               //printf("recent write: %x\n",recent_writes[i]);
1076               inv=1;
1077             }
1078           }
1079           if(!inv) {
1080             void * clean_addr=(void *)get_clean_addr((int)head->addr);
1081             if((((u32)clean_addr-(u32)out)<<(32-TARGET_SIZE_2))>0x60000000+(MAX_OUTPUT_BLOCK_SIZE<<(32-TARGET_SIZE_2))) {
1082               u32 *ht_bin;
1083               inv_debug("INV: Restored %x (%x/%x)\n",head->vaddr, (int)head->addr, (int)clean_addr);
1084               //printf("page=%x, addr=%x\n",page,head->vaddr);
1085               //assert(head->vaddr>>12==(page|0x80000));
1086               ll_add_nodup(jump_in+page,head->vaddr,clean_addr);
1087               ht_bin=hash_table[((head->vaddr>>16)^head->vaddr)&0xFFFF];
1088               if(ht_bin[0]==head->vaddr) {
1089                 ht_bin[1]=(int)clean_addr; // Replace existing entry
1090               }
1091               if(ht_bin[2]==head->vaddr) {
1092                 ht_bin[3]=(int)clean_addr; // Replace existing entry
1093               }
1094             }
1095             if(vstart) {
1096               //printf("start=%x, end=%x\n",vstart,vend);
1097               for(i=0;i<vend-vstart;i+=4) {
1098                 cached_code_words[((vstart<4194304?vstart:((vstart|0x400000)&0x7fffff))+i)>>5]|=1<<(((vstart+i)>>2)&7);
1099               }
1100             }
1101           }
1102         }
1103       }
1104     }
1105     head=head->next;
1106   }
1107 }
1108 
1109 
// Update the constant-tracking state for instruction i.
//
// 'isconst' is a bitmask indexed by register number (tested below with
// (*isconst>>reg)&1) and 'constmap' holds the per-register values; both
// are updated through sh2_set_const/sh2_clear_const or directly.
void do_consts(int i,u32 *isconst,u32 *constmap)
{
  switch(itype[i]) {
    case LOAD:
      sh2_clear_const(isconst,constmap,rt1[i]);
      if(addrmode[i]==POSTINC) {
        // Post-increment advances the address register by the access size
        const int size=(opcode[i]==4)?2:(opcode2[i]&3);
        constmap[rt2[i]]+=1<<size;
      }
      break;

    case STORE:
      if(addrmode[i]==PREDEC) {
        // Pre-decrement moves the address register back by the access size
        const int size=(opcode[i]==4)?2:(opcode2[i]&3);
        constmap[rt1[i]]-=1<<size;
      }
      break;

    case PCREL:
      if(opcode[i]==12) { // MOVA
        sh2_set_const(isconst,constmap,rt1[i],((start+i*2+4)&~3)+imm[i]);
      }
      else { // PC-relative load (constant pool)
        const u32 addr=((start+i*2+4)&~3)+imm[i];
        if(!((u32)((addr-start)>>1)<slen)) {
          // Pool entry lies outside the block being compiled
          sh2_clear_const(isconst,constmap,rt1[i]);
        }
        else {
          int value;
          if(opcode[i]==9) { // MOV.W
            value=(s16)source[((start+i*2+4)+imm[i]-start)>>1];
          }
          else { // MOV.L
            value=(source[(((start+i*2+4)&~3)+imm[i]-start)>>1]<<16)+source[(((start+i*2+4)&~3)+imm[i]+2-start)>>1];
          }
          sh2_set_const(isconst,constmap,rt1[i],value);
        }
      }
      break;

    case MOV:
      if(((*isconst)>>rs1[i])&1) {
        int v=constmap[rs1[i]];
        sh2_set_const(isconst,constmap,rt1[i],v);
      }
      else {
        sh2_clear_const(isconst,constmap,rt1[i]);
      }
      break;

    case IMM8:
      switch(opcode[i]) {
        case 0x7: // ADD
          if(((*isconst)>>rs1[i])&1) {
            int v=constmap[rs1[i]];
            sh2_set_const(isconst,constmap,rt1[i],v+imm[i]);
          }
          else {
            sh2_clear_const(isconst,constmap,rt1[i]);
          }
          break;
        case 0x8: // CMP/EQ
          break;
        case 12:
          if(opcode2[i]==8) { // TST
          }
          else if(((*isconst)>>rs1[i])&1) { // AND/XOR/OR
            int v=constmap[rs1[i]];
            switch(opcode2[i]) {
              case 0x09: sh2_set_const(isconst,constmap,rt1[i],v&imm[i]); break;
              case 0x0a: sh2_set_const(isconst,constmap,rt1[i],v^imm[i]); break;
              case 0x0b: sh2_set_const(isconst,constmap,rt1[i],v|imm[i]); break;
              default: break;
            }
          }
          else {
            sh2_clear_const(isconst,constmap,rt1[i]);
          }
          break;
        default: // opcode[i]==0xE
          assert(opcode[i]==0xE);
          sh2_set_const(isconst,constmap,rt1[i],imm[i]); // MOV
          break;
      }
      break;

    case FLAGS:
      if(opcode2[i]==9) { // MOVT
        sh2_clear_const(isconst,constmap,rt1[i]);
      }
      break;

    // The destination register is not tracked for these types
    case ALU:
    case EXT:
    case SHIFTIMM:
      sh2_clear_const(isconst,constmap,rt1[i]);
      break;

    case MULTDIV:
      switch(opcode[i]) {
        case 0:
          switch(opcode2[i]) {
            case 7: // MUL.L
              sh2_clear_const(isconst,constmap,MACL);
              break;
            case 8: // CLRMAC
              sh2_clear_const(isconst,constmap,MACH);
              sh2_clear_const(isconst,constmap,MACL);
              break;
            default: // DIV0U (9) and others: nothing to update
              break;
          }
          break;
        case 2:
          // DIV0S (opcode2==7) needs no update
          if(opcode2[i]==14||opcode2[i]==15) { // MULU.W / MULS.W
            sh2_clear_const(isconst,constmap,MACL);
          }
          break;
        case 3: // DMULU.L / DMULS.L
          sh2_clear_const(isconst,constmap,MACH);
          sh2_clear_const(isconst,constmap,MACL);
          break;
        default:
          break;
      }
      break;

    // Nothing is recorded for these types
    case RMW:
    case UJUMP:
    case RJUMP:
    case SJUMP:
    case CJUMP:
      break;

    // Forget every known constant
    case SYSTEM:
    case COMPLEX:
      *isconst=0;
      break;
  }
}
1232 
mov_alloc(struct regstat * current,int i)1233 void mov_alloc(struct regstat *current,int i)
1234 {
1235   // Note: Don't need to actually alloc the source registers
1236   // TODO: Constant propagation
1237   //alloc_reg(current,i,rs1[i]);
1238   alloc_reg(current,i,rt1[i]);
1239   clear_const(current,rs1[i]);
1240   clear_const(current,rt1[i]);
1241   dirty_reg(current,rt1[i]);
1242 }
1243 
shiftimm_alloc(struct regstat * current,int i)1244 void shiftimm_alloc(struct regstat *current,int i)
1245 {
1246   clear_const(current,rs1[i]);
1247   clear_const(current,rt1[i]);
1248   alloc_reg(current,i,rs1[i]);
1249   alloc_reg(current,i,rt1[i]);
1250   dirty_reg(current,rt1[i]);
1251   if(opcode[i]==4) {
1252     if(opcode2[i]<6) { // SHLL/SHAL/SHLR/SHAR/ROTL/ROTCL/ROTR/ROTCR
1253       if(opcode2[i]<4||opcode3[i]<2) {
1254         // SHL/SHA/ROT don't need T bit as a source, only a destination
1255         if(!(current->u&(1LL<<TBIT))) {
1256           alloc_reg(current,i,SR);
1257           dirty_reg(current,SR);
1258         }
1259       }
1260       else {
1261         alloc_reg(current,i,SR); // ROTCL/ROTCR always need T bit
1262         dirty_reg(current,SR);
1263       }
1264     }
1265   }
1266   if(opcode[i]==2&opcode2[i]==13) { // XTRCT
1267     clear_const(current,rs2[i]);
1268     alloc_reg(current,i,rs2[i]);
1269   }
1270 }
1271 
alu_alloc(struct regstat * current,int i)1272 void alu_alloc(struct regstat *current,int i)
1273 {
1274   if(opcode[i]==2) {
1275     alloc_reg(current,i,rs1[i]);
1276     alloc_reg(current,i,rs2[i]);
1277     clear_const(current,rs2[i]);
1278     if(opcode2[i]>8&&opcode2[i]<=11) { // AND/XOR/OR
1279       alloc_reg(current,i,rt1[i]);
1280     }
1281     else  // TST or CMP/STR
1282     {
1283       alloc_reg(current,i,SR); // Liveness analysis on TBIT?
1284       dirty_reg(current,SR);
1285       //#ifdef __x86__ ?
1286       //#ifdef NEEDS_TEMP
1287       if(opcode2[i]==8) { // TST
1288         alloc_reg_temp(current,i,-1);
1289         minimum_free_regs[i]=1;
1290       }
1291       if(opcode2[i]==12) { // CMP/STR
1292         alloc_reg_temp(current,i,-1);
1293         minimum_free_regs[i]=1;
1294       }
1295     }
1296   }
1297   if(opcode[i]==3) {
1298     alloc_reg(current,i,rs1[i]);
1299     alloc_reg(current,i,rs2[i]);
1300     clear_const(current,rs2[i]);
1301     if(opcode2[i]<8) { // CMP intructions
1302       alloc_reg(current,i,SR); // Liveness analysis on TBIT?
1303       dirty_reg(current,SR);
1304       alloc_reg_temp(current,i,-1);
1305       minimum_free_regs[i]=1;
1306     }else{ // ADD/SUB
1307       alloc_reg(current,i,rt1[i]);
1308       if(opcode2[i]&3) {
1309         alloc_reg(current,i,SR);
1310         dirty_reg(current,SR);
1311         //#ifdef NEEDS_TEMP
1312         if((opcode2[i]&3)==3) {
1313           // Need a temporary register for ADDV/SUBV on x86
1314           alloc_reg_temp(current,i,-1);
1315           minimum_free_regs[i]=1;
1316         }
1317       }
1318     }
1319   }
1320   if(opcode[i]==4) { // DT/CMPPZ/CMPPL
1321     // Single operand forms
1322     alloc_reg(current,i,rs1[i]);
1323     if(opcode2[i]==0) dirty_reg(current,rt1[i]); // DT
1324     alloc_reg(current,i,SR); // Liveness analysis on TBIT?
1325     dirty_reg(current,SR);
1326     if(opcode2[i]>0) {
1327       alloc_reg_temp(current,i,-1);
1328       minimum_free_regs[i]=1;
1329     }
1330   }
1331   if(opcode[i]==6) { // NOT/NEG/NEGC
1332     if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1333     alloc_reg(current,i,rt1[i]);
1334     if(opcode2[i]==8||opcode2[i]==9) { // SWAP needs temp (?)
1335       alloc_reg_temp(current,i,-1);
1336       minimum_free_regs[i]=1;
1337     }
1338     if(opcode2[i]==10) {
1339       // NEGC sets T bit
1340       alloc_reg(current,i,SR); // Liveness analysis on TBIT?
1341       dirty_reg(current,SR);
1342     }
1343   }
1344   clear_const(current,rs1[i]);
1345   clear_const(current,rt1[i]);
1346   dirty_reg(current,rt1[i]);
1347 }
1348 
imm8_alloc(struct regstat * current,int i)1349 void imm8_alloc(struct regstat *current,int i)
1350 {
1351   //if(rs1[i]>=0&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1352   //else lt1[i]=rs1[i];
1353   alloc_reg(current,i,rs1[i]);
1354   if(rt1[i]>=0&&rt1[i]!=TBIT) alloc_reg(current,i,rt1[i]);
1355   if(opcode[i]==0x7) { // ADD
1356     if(is_const(current,rs1[i])) {
1357       int v=get_const(current,rs1[i]);
1358       set_const(current,rt1[i],v+imm[i]);
1359     }
1360     else clear_const(current,rt1[i]);
1361   }
1362   else if(opcode[i]==0x8) { // CMP/EQ
1363     alloc_reg(current,i,SR); // Liveness analysis on TBIT?
1364     dirty_reg(current,SR);
1365     alloc_reg_temp(current,i,-1);
1366     minimum_free_regs[i]=1;
1367   }
1368   else if(opcode[i]==12) {
1369     if(opcode2[i]==8) { // TST
1370       alloc_reg(current,i,SR); // Liveness analysis on TBIT?
1371       dirty_reg(current,SR);
1372       alloc_reg_temp(current,i,-1);
1373       minimum_free_regs[i]=1;
1374     }else
1375     // AND/XOR/OR
1376     if(is_const(current,rs1[i])) {
1377       int v=get_const(current,rs1[i]);
1378       if(opcode2[i]==0x09) set_const(current,rt1[i],v&imm[i]);
1379       if(opcode2[i]==0x0a) set_const(current,rt1[i],v^imm[i]);
1380       if(opcode2[i]==0x0b) set_const(current,rt1[i],v|imm[i]);
1381     }
1382     else clear_const(current,rt1[i]);
1383   }
1384   else { // opcode[i]==0xE
1385     assert(opcode[i]==0xE);
1386     set_const(current,rt1[i],imm[i]); // MOV
1387   }
1388   if(rt1[i]>=0&&rt1[i]!=TBIT) dirty_reg(current,rt1[i]);
1389 }
1390 
ext_alloc(struct regstat * current,int i)1391 void ext_alloc(struct regstat *current,int i)
1392 {
1393   // Note: Don't need to actually alloc the source registers
1394   // FIXME: Constant propagation
1395   //alloc_reg(current,i,rs1[i]);
1396   alloc_reg(current,i,rt1[i]);
1397   clear_const(current,rs1[i]);
1398   clear_const(current,rt1[i]);
1399   dirty_reg(current,rt1[i]);
1400 }
1401 
flags_alloc(struct regstat * current,int i)1402 void flags_alloc(struct regstat *current,int i)
1403 {
1404   if(opcode2[i]==8) { // CLRT/SETT
1405     alloc_reg(current,i,SR);
1406     dirty_reg(current,SR);
1407   }else
1408   if(opcode2[i]==9) { // MOVT
1409     alloc_reg(current,i,SR);
1410     alloc_reg(current,i,rt1[i]);
1411     clear_const(current,rt1[i]);
1412     dirty_reg(current,rt1[i]);
1413   }
1414 }
1415 
load_alloc(struct regstat * current,int i)1416 void load_alloc(struct regstat *current,int i)
1417 {
1418   int hr;
1419   clear_const(current,rt1[i]);
1420   //if(rs1[i]!=rt1[i]&&needed_again(rs1[i],i)) clear_const(current,rs1[i]); // Does this help or hurt?
1421   if(needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1422  // if(rs2[i]>=0) alloc_reg(current,i,rs2[i]);
1423   alloc_reg(current,i,rt1[i]==TBIT?SR:rt1[i]);
1424   if(addrmode[i]==DUALIND||addrmode[i]==GBRIND) {
1425     alloc_reg(current,i,rs1[i]);
1426     alloc_reg(current,i,rs2[i]);
1427     if(!is_const(current,rs1[i])||!is_const(current,rs2[i])) {
1428       // Both must be constants to propagate the sum
1429       clear_const(current,rs1[i]);
1430       clear_const(current,rs2[i]);
1431     }
1432   }
1433   else
1434   if(addrmode[i]==POSTINC) {
1435     if(is_const(current,rt2[i])) {
1436       int v=get_const(current,rt2[i]);
1437       set_const(current,rt2[i],v+(1<<((opcode[i]==4)?2:(opcode2[i]&3))));
1438       // Note: constant is preincremented, address_generation corrects the offset
1439     }
1440     else {
1441       alloc_reg(current,i,rt2[i]);
1442       dirty_reg(current,rt2[i]);
1443     }
1444   }
1445 
1446   // Need a register to load from memory_map
1447   alloc_reg(current,i,MOREG);
1448   if(rt1[i]==TBIT||get_reg(current->regmap,rt1[i])<0) {
1449     // dummy load, but we still need a register to calculate the address
1450     alloc_reg_temp(current,i,-1);
1451     minimum_free_regs[i]=1;
1452   }
1453   if(rt1[i]==TBIT) dirty_reg(current,SR);
1454   else dirty_reg(current,rt1[i]);
1455 
1456   // Make MOREG a temporary, give pass 5 another register to work with
1457   hr=get_reg(current->regmap,MOREG);
1458   assert(hr>=0);
1459   assert(current->regmap[hr]==MOREG);
1460   current->regmap[hr]=-1;
1461   minimum_free_regs[i]++;
1462 }
1463 
store_alloc(struct regstat * current,int i)1464 void store_alloc(struct regstat *current,int i)
1465 {
1466   int hr;
1467   //printf("%x: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",start+i*2,current->regmap[0],current->regmap[1],current->regmap[2],current->regmap[3],current->regmap[5],current->regmap[6],current->regmap[7]);
1468   if(addrmode[i]==DUALIND) {
1469     alloc_reg(current,i,rs2[i]);
1470     alloc_reg(current,i,0); // rs3[i]
1471     if(!is_const(current,rs2[i])||!is_const(current,rs3[i])) {
1472       // Both must be constants to propagate the sum
1473       clear_const(current,rs2[i]);
1474       clear_const(current,rs3[i]);
1475     }
1476   }
1477   if(addrmode[i]==PREDEC) {
1478     if(is_const(current,rt1[i])) {
1479       int v=get_const(current,rt1[i]);
1480       set_const(current,rt1[i],v-(1<<((opcode[i]==4)?2:(opcode2[i]&3))));
1481     }
1482     else {
1483       alloc_reg(current,i,rt1[i]);
1484       dirty_reg(current,rt1[i]);
1485     }
1486   }
1487   if(needed_again(rs2[i],i)) alloc_reg(current,i,rs2[i]);
1488   clear_const(current,rs1[i]);
1489   alloc_reg(current,i,rs1[i]);
1490   // Need a register to load from memory_map
1491   alloc_reg(current,i,MOREG);
1492 
1493   // We need a temporary register for address generation
1494   alloc_reg_temp(current,i,-1);
1495   minimum_free_regs[i]=1;
1496 
1497   // Make MOREG a temporary, give pass 5 another register to work with
1498   hr=get_reg(current->regmap,MOREG);
1499   assert(hr>=0);
1500   assert(current->regmap[hr]==MOREG);
1501   current->regmap[hr]=-1;
1502   minimum_free_regs[i]++;
1503 }
1504 
rmw_alloc(struct regstat * current,int i)1505 void rmw_alloc(struct regstat *current,int i)
1506 {
1507   //printf("%x: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",start+i*2,current->regmap[0],current->regmap[1],current->regmap[2],current->regmap[3],current->regmap[5],current->regmap[6],current->regmap[7]);
1508   if(addrmode[i]==GBRIND) {
1509     alloc_reg(current,i,GBR);
1510     alloc_reg(current,i,0);
1511     if(!is_const(current,rs2[i])||!is_const(current,rs3[i])) {
1512       // Both must be constants to propagate the sum
1513       clear_const(current,rs2[i]);
1514       clear_const(current,rs3[i]);
1515     }
1516   }
1517   if(addrmode[i]==REGIND&&needed_again(rs1[i],i)) alloc_reg(current,i,rs1[i]);
1518   if(rt1[i]==TBIT) {
1519     alloc_reg(current,i,SR);
1520     dirty_reg(current,SR);
1521   }
1522 
1523   // Need a register to load from memory_map
1524   alloc_reg(current,i,MOREG);
1525 
1526   // We need a temporary register for address generation
1527   alloc_reg_temp(current,i,-1);
1528   // And one for the read-modify-write
1529   //alloc_reg_temp(current,i,-2); // Can re-use mapping reg for this
1530   minimum_free_regs[i]=1;
1531 }
1532 
pcrel_alloc(struct regstat * current,int i)1533 void pcrel_alloc(struct regstat *current,int i)
1534 {
1535   u32 addr;
1536   alloc_reg(current,i,rt1[i]);
1537   addr=((start+i*2+4)&~3)+imm[i];
1538   if(opcode[i]==12) { // MOVA, address generation only
1539     set_const(current,rt1[i],addr);
1540   }else if((unsigned)((addr-start)>>1)<slen) {
1541     if(opcode[i]==9) { // MOV.W
1542       addr=(start+i*2+4)+imm[i];
1543       set_const(current,rt1[i],(s16)source[(addr-start)>>1]);
1544     }
1545     else // MOV.L
1546       set_const(current,rt1[i],(source[(addr-start)>>1]<<16)+source[(addr+2-start)>>1]);
1547   }
1548   else {
1549     // Do actual load
1550     //alloc_reg(current,i,MOREG);
1551     clear_const(current,rt1[i]);
1552   }
1553   dirty_reg(current,rt1[i]);
1554 }
1555 
1556 #ifndef multdiv_alloc
multdiv_alloc(struct regstat * current,int i)1557 void multdiv_alloc(struct regstat *current,int i)
1558 {
1559   //printf("%x: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",start+i*2,current->regmap[0],current->regmap[1],current->regmap[2],current->regmap[3],current->regmap[5],current->regmap[6],current->regmap[7]);
1560   if(opcode[i]==0) {
1561     if(opcode2[i]==7) // MUL.L
1562     {
1563       clear_const(current,rs1[i]);
1564       clear_const(current,rs2[i]);
1565       clear_const(current,MACL);
1566       alloc_reg(current,i,rs1[i]);
1567       alloc_reg(current,i,rs2[i]);
1568       alloc_reg(current,i,MACL);
1569       dirty_reg(current,MACL);
1570     }
1571     if(opcode2[i]==8) // CLRMAC
1572     {
1573       clear_const(current,MACH);
1574       clear_const(current,MACL);
1575       alloc_reg(current,i,MACH);
1576       alloc_reg(current,i,MACL);
1577       dirty_reg(current,MACH);
1578       dirty_reg(current,MACL);
1579     }
1580     if(opcode2[i]==9) // DIV0U
1581     {
1582       alloc_reg(current,i,SR);
1583       dirty_reg(current,SR);
1584     }
1585   }
1586   if(opcode[i]==2) {
1587     if(opcode2[i]==7) // DIV0S
1588     {
1589       clear_const(current,rs1[i]); // Is this necessary?
1590       clear_const(current,rs2[i]); // Is this necessary?
1591       alloc_reg(current,i,rs1[i]);
1592       alloc_reg(current,i,rs2[i]);
1593       alloc_reg(current,i,SR);
1594       dirty_reg(current,SR);
1595       #if defined(__i386__) || defined(__x86_64__)
1596       //#ifdef NEEDS_TEMP
1597       alloc_reg_temp(current,i,-1);
1598       minimum_free_regs[i]=1;
1599       #endif
1600     }
1601     if(opcode2[i]==14||opcode2[i]==15) // MULU.W / MULS.W
1602     {
1603       clear_const(current,rs1[i]);
1604       clear_const(current,rs2[i]);
1605       clear_const(current,MACL);
1606       alloc_reg(current,i,rs1[i]);
1607       alloc_reg(current,i,rs2[i]);
1608       alloc_reg(current,i,MACL);
1609       dirty_reg(current,MACL);
1610       //#ifdef NEEDS_TEMP
1611       alloc_reg_temp(current,i,-1);
1612       minimum_free_regs[i]=1;
1613     }
1614   }
1615   if(opcode[i]==3) {
1616     // DMULU.L / DMULS.L
1617     #if defined(__i386__) || defined(__x86_64__)
1618     if(!(current->u&(1LL<<MACH))) {
1619       alloc_x86_reg(current,i,MACH,EDX); // Don't need to alloc MACH if it's unneeded
1620       current->u&=~(1LL<<MACL); // But if it is, then assume MACL is needed since it will be overwritten
1621     }
1622     alloc_x86_reg(current,i,MACL,EAX);
1623     #else
1624     if(!(current->u&(1LL<<MACH))) {
1625       alloc_reg(current,i,MACH);
1626       current->u&=~(1LL<<MACL);
1627     }
1628     alloc_reg(current,i,MACL);
1629     #endif
1630     clear_const(current,rs1[i]);
1631     clear_const(current,rs2[i]);
1632     clear_const(current,MACH);
1633     clear_const(current,MACL);
1634     alloc_reg(current,i,rs1[i]);
1635     alloc_reg(current,i,rs2[i]);
1636     dirty_reg(current,MACH);
1637     dirty_reg(current,MACL);
1638   }
1639 }
1640 #endif
1641 
complex_alloc(struct regstat * current,int i)1642 void complex_alloc(struct regstat *current,int i)
1643 {
1644   if(opcode[i]==3&&opcode2[i]==4) { // DIV1
1645     #if defined(__i386__) || defined(__x86_64__)
1646     alloc_x86_reg(current,i,rs1[i],ECX);
1647     alloc_x86_reg(current,i,rs2[i],EAX);
1648     alloc_x86_reg(current,i,SR,EDX);
1649     alloc_all(current,i);
1650     #else
1651     #if defined(__arm__)
1652     alloc_arm_reg(current,i,rs1[i],1);
1653     alloc_arm_reg(current,i,rs2[i],0);
1654     alloc_arm_reg(current,i,SR,2);
1655     alloc_all(current,i);
1656     #else
1657     // FIXME
1658     assert(0);
1659     #endif
1660     #endif
1661     dirty_reg(current,rs2[i]);
1662     dirty_reg(current,SR);
1663   }
1664   if(opcode[i]==0&&opcode2[i]==15) { // MAC.L
1665     #if defined(__i386__) || defined(__x86_64__)
1666     alloc_x86_reg(current,i,rs1[i],EBP);
1667     alloc_x86_reg(current,i,rs2[i],EDI);
1668     alloc_x86_reg(current,i,SR,EBX);
1669     alloc_all(current,i);
1670     alloc_x86_reg(current,i,MACL,EAX);
1671     alloc_x86_reg(current,i,MACH,EDX);
1672     #else
1673     #if defined(__arm__)
1674     alloc_arm_reg(current,i,rs1[i],5);
1675     alloc_arm_reg(current,i,rs2[i],6);
1676     alloc_arm_reg(current,i,SR,4);
1677     alloc_all(current,i);
1678     alloc_arm_reg(current,i,MACL,0);
1679     alloc_arm_reg(current,i,MACH,1);
1680     #else
1681     // FIXME
1682     assert(0);
1683     #endif
1684     #endif
1685     dirty_reg(current,rs1[i]);
1686     dirty_reg(current,rs2[i]);
1687     dirty_reg(current,MACH);
1688     dirty_reg(current,MACL);
1689     clear_const(current,MACH);
1690     clear_const(current,MACL);
1691   }
1692   if(opcode[i]==4&&opcode2[i]==15) { // MAC.W
1693     #if defined(__i386__) || defined(__x86_64__)
1694     alloc_x86_reg(current,i,rs1[i],EBP);
1695     alloc_x86_reg(current,i,rs2[i],EDI);
1696     alloc_x86_reg(current,i,SR,EBX);
1697     alloc_all(current,i);
1698     alloc_x86_reg(current,i,MACL,EAX);
1699     alloc_x86_reg(current,i,MACH,EDX);
1700     #else
1701     #if defined(__arm__)
1702     alloc_arm_reg(current,i,rs1[i],5);
1703     alloc_arm_reg(current,i,rs2[i],6);
1704     alloc_arm_reg(current,i,SR,4);
1705     alloc_all(current,i);
1706     alloc_arm_reg(current,i,MACL,0);
1707     alloc_arm_reg(current,i,MACH,1);
1708     #else
1709     // FIXME
1710     assert(0);
1711     #endif
1712     #endif
1713     dirty_reg(current,rs1[i]);
1714     dirty_reg(current,rs2[i]);
1715     dirty_reg(current,MACH);
1716     dirty_reg(current,MACL);
1717     clear_const(current,MACH);
1718     clear_const(current,MACL);
1719   }
1720   clear_const(current,rs1[i]);
1721   clear_const(current,rs2[i]);
1722   minimum_free_regs[i]=HOST_REGS;
1723 }
1724 
system_alloc(struct regstat * current,int i)1725 void system_alloc(struct regstat *current,int i)
1726 {
1727   alloc_cc(current,i);
1728   dirty_reg(current,CCREG);
1729   if(opcode[i]==12) { // TRAPA
1730     alloc_reg(current,i,15); // Stack reg
1731     dirty_reg(current,15);
1732     alloc_reg(current,i,SR); // Status/flags
1733     alloc_reg(current,i,VBR);
1734     alloc_reg(current,i,MOREG); // memory_map offset
1735     alloc_reg_temp(current,i,-1);
1736     minimum_free_regs[i]=1;
1737   }
1738   current->isdoingcp=0;
1739 }
1740 
delayslot_alloc(struct regstat * current,int i)1741 void delayslot_alloc(struct regstat *current,int i)
1742 {
1743   switch(itype[i]) {
1744     case UJUMP:
1745     case CJUMP:
1746     case SJUMP:
1747     case RJUMP:
1748     case SYSCALL:
1749       assem_debug("jump in the delay slot.  this shouldn't happen.\n");//exit(1);
1750       printf("Disabled speculative precompilation\n");
1751       stop_after_jal=1;
1752       break;
1753     case IMM8:
1754       imm8_alloc(current,i);
1755       break;
1756     case LOAD:
1757       load_alloc(current,i);
1758       break;
1759     case STORE:
1760       store_alloc(current,i);
1761       break;
1762     case RMW:
1763       rmw_alloc(current,i);
1764       break;
1765     case PCREL:
1766       pcrel_alloc(current,i);
1767       break;
1768     case ALU:
1769       alu_alloc(current,i);
1770       break;
1771     case MULTDIV:
1772       multdiv_alloc(current,i);
1773       break;
1774     case SHIFTIMM:
1775       shiftimm_alloc(current,i);
1776       break;
1777     case MOV:
1778       mov_alloc(current,i);
1779       break;
1780     case EXT:
1781       ext_alloc(current,i);
1782       break;
1783     case FLAGS:
1784       flags_alloc(current,i);
1785       break;
1786     case COMPLEX:
1787       complex_alloc(current,i);
1788       break;
1789   }
1790 }
1791 
add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)1792 void add_stub(int type,int addr,int retaddr,int a,int b,int c,int d,int e)
1793 {
1794   stubs[stubcount][0]=type;
1795   stubs[stubcount][1]=addr;
1796   stubs[stubcount][2]=retaddr;
1797   stubs[stubcount][3]=a;
1798   stubs[stubcount][4]=b;
1799   stubs[stubcount][5]=c;
1800   stubs[stubcount][6]=d;
1801   stubs[stubcount][7]=e;
1802   stubcount++;
1803 }
1804 
1805 // Write out a single register
wb_register(signed char r,signed char regmap[],u32 dirty)1806 void wb_register(signed char r,signed char regmap[],u32 dirty)
1807 {
1808   int hr;
1809   for(hr=0;hr<HOST_REGS;hr++) {
1810     if(hr!=EXCLUDE_REG) {
1811       if((regmap[hr]&63)==r) {
1812         if((dirty>>hr)&1) {
1813           emit_storereg(r,hr);
1814         }
1815       }
1816     }
1817   }
1818 }
1819 
1820 /*int mchecksum()
1821 {
1822   //if(!tracedebug) return 0;
1823   int i;
1824   int sum=0;
1825   for(i=0;i<2097152;i++) {
1826     unsigned int temp=sum;
1827     sum<<=1;
1828     sum|=(~temp)>>31;
1829     sum^=((u_int *)rdram)[i];
1830   }
1831   return sum;
1832 }
1833 int rchecksum()
1834 {
1835   int i;
1836   int sum=0;
1837   for(i=0;i<64;i++)
1838     sum^=((u_int *)reg)[i];
1839   return sum;
1840 }
1841 int fchecksum()
1842 {
1843   int i;
1844   int sum=0;
1845   for(i=0;i<64;i++)
1846     sum^=((u_int *)reg_cop1_fgr_64)[i];
1847   return sum;
1848 }
1849 void rlist()
1850 {
1851   int i;
1852   printf("TRACE: ");
1853   for(i=0;i<32;i++)
1854     printf("r%d:%8x%8x ",i,((int *)(reg+i))[1],((int *)(reg+i))[0]);
1855   printf("\n");
1856   //printf("TRACE: ");
1857   //for(i=0;i<32;i++)
1858   //  printf("f%d:%8x%8x ",i,((int*)reg_cop1_simple[i])[1],*((int*)reg_cop1_simple[i]));
1859   //printf("\n");
1860 }*/
1861 
enabletrace()1862 void enabletrace()
1863 {
1864   tracedebug=1;
1865 }
1866 
#if 0
/*
 * Debug trace helper, compiled out by the surrounding "#if 0".
 *
 * When enabled it prints a checksum obtained from mchecksum() and then dumps
 * raw words located relative to the address of its argument (i386) or of a
 * local variable (ARM).  Those reads index outside the named objects, so the
 * output depends entirely on the compiler's stack layout.
 * NOTE(review): mchecksum() only exists inside a commented-out block in this
 * file, so this function cannot be re-enabled without restoring it.
 */
void memdebug(int i)
{
  //printf("TRACE: count=%d next=%d (checksum %x) lo=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[LOREG]>>32),(int)reg[LOREG]);
  //printf("TRACE: count=%d next=%d (rchecksum %x)\n",Count,next_interupt,rchecksum());
  //rlist();
  //if(tracedebug) {
  //if(Count>=-2084597794) {
  //if((signed int)Count>=-2084597794&&(signed int)Count<0) {
  //if(0) {
    printf("TRACE: (checksum %x)\n",mchecksum());
    //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
    //printf("TRACE: count=%d next=%d (checksum %x) Status=%x\n",Count,next_interupt,mchecksum(),Status);
    //printf("TRACE: count=%d next=%d (checksum %x) hi=%8x%8x\n",Count,next_interupt,mchecksum(),(int)(reg[HIREG]>>32),(int)reg[HIREG]);
    //rlist();
    #ifdef __i386__
    // Word just below the argument on the stack
    printf("TRACE: %x\n",(&i)[-1]);
    #endif
    #ifdef __arm__
    int j;
    // Words above the local j on the stack
    printf("TRACE: %x \n",(&j)[10]);
    printf("TRACE: %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",(&j)[1],(&j)[2],(&j)[3],(&j)[4],(&j)[5],(&j)[6],(&j)[7],(&j)[8],(&j)[9],(&j)[10],(&j)[11],(&j)[12],(&j)[13],(&j)[14],(&j)[15],(&j)[16],(&j)[17],(&j)[18],(&j)[19],(&j)[20]);
    #endif
    //fflush(stdout);
  //}
  //printf("TRACE: %x\n",(&i)[-1]);
}
#endif
1895 
alu_assemble(int i,struct regstat * i_regs)1896 void alu_assemble(int i,struct regstat *i_regs)
1897 {
1898   if(opcode[i]==2) {
1899     if(opcode2[i]>=9&&opcode2[i]<=11) { // AND/XOR/OR
1900       signed char s,t;
1901       s=get_reg(i_regs->regmap,rs1[i]);
1902       t=get_reg(i_regs->regmap,rt1[i]);
1903       //assert(s>=0);
1904       if(t>=0) {
1905         if(opcode2[i]==9) emit_and(s,t,t);
1906         if(opcode2[i]==10) emit_xor(rs1[i]>=0?s:t,t,t);
1907         if(opcode2[i]==11) emit_or(s,t,t);
1908       }
1909     }
1910     else
1911     {
1912       signed char s1,s2,sr,temp;
1913       s1=get_reg(i_regs->regmap,rs1[i]);
1914       s2=get_reg(i_regs->regmap,rs2[i]);
1915       sr=get_reg(i_regs->regmap,SR);
1916       temp=get_reg(i_regs->regmap,-1);
1917       assert(s1>=0);
1918       assert(s2>=0);
1919       assert(sr>=0);
1920       assert(temp>=0); // Not needed for TST on ARM?
1921       if(opcode2[i]==8) { // TST
1922         emit_sh2tst(s1,s2,sr,temp);
1923       }
1924       else if(opcode2[i]==12) { // CMP/STR
1925         emit_cmpstr(s1,s2,sr,temp);
1926       }
1927     }
1928   }
1929   if(opcode[i]==3) { // ADD/SUB
1930     if(opcode2[i]<8) { // CMP
1931       signed char s1,s2,sr,temp;
1932       s1=get_reg(i_regs->regmap,rs1[i]);
1933       s2=get_reg(i_regs->regmap,rs2[i]);
1934       sr=get_reg(i_regs->regmap,SR);
1935       temp=get_reg(i_regs->regmap,-1);
1936       assert(s1>=0);
1937       assert(s2>=0);
1938       assert(temp>=0);
1939       if(opcode2[i]==0) emit_cmpeq(s1,s2,sr,temp);
1940       if(opcode2[i]==2) emit_cmphs(s1,s2,sr,temp);
1941       if(opcode2[i]==3) emit_cmpge(s1,s2,sr,temp);
1942       if(opcode2[i]==6) emit_cmphi(s1,s2,sr,temp);
1943       if(opcode2[i]==7) emit_cmpgt(s1,s2,sr,temp);
1944     }
1945     else
1946     {
1947       signed char s,t,sr,temp;
1948       t=get_reg(i_regs->regmap,rt1[i]);
1949       if(t>=0) {
1950         s=get_reg(i_regs->regmap,rs1[i]);
1951         sr=get_reg(i_regs->regmap,SR);
1952         temp=get_reg(i_regs->regmap,-1);
1953         assert(s>=0);
1954         //assert(s2==t);
1955         if(opcode2[i]==8) emit_sub(t,s,t);
1956         if(opcode2[i]==10) emit_subc(s,t,sr);
1957         //if(opcode2[i]==11) emit_subv(s,sr,temp);
1958         assert(opcode2[i]!=11);
1959         if(opcode2[i]==12) emit_add(s,t,t);
1960         if(opcode2[i]==14) emit_addc(s,t,sr);
1961         //if(opcode2[i]==15) emit_addv(s,sr,temp);
1962         assert(opcode2[i]!=15);
1963       }
1964     }
1965   }
1966   if(opcode[i]==4) { // DT/CMPPZ/CMPPL
1967     signed char s,t,sr,temp;
1968     s=get_reg(i_regs->regmap,rs1[i]);
1969     sr=get_reg(i_regs->regmap,SR);
1970     assert(s>=0);
1971     assert(sr>=0);
1972     if(opcode2[i]==0) {
1973       t=get_reg(i_regs->regmap,rt1[i]);
1974       assert(t>=0); // FIXME - Liveness analysis
1975       assert(s==t);
1976       emit_dt(s,sr);
1977     }
1978     else if(opcode2[i]==1) emit_cmppz(s,sr);
1979     else if(opcode2[i]==5)
1980     {
1981       temp=get_reg(i_regs->regmap,-1);
1982       emit_cmppl(s,sr,temp);
1983     }
1984   }
1985   if(opcode[i]==6) { // NOT/SWAP/NEG
1986     int s=get_reg(i_regs->regmap,rs1[i]);
1987     int t=get_reg(i_regs->regmap,rt1[i]);
1988     if(s<0) {
1989       // FIXME: Preload?
1990       emit_loadreg(rs1[i],t);
1991       s=t;
1992     }
1993     if(t>=0) {
1994       if(opcode2[i]==7) emit_not(s,t);
1995       if(opcode2[i]==8) emit_swapb(s,t);
1996       if(opcode2[i]==9) emit_rorimm(s,16,t);
1997       if(opcode2[i]==11) emit_neg(s,t);
1998     }
1999     if(opcode2[i]==10) { // NEGC
2000       int sr=get_reg(i_regs->regmap,SR);
2001       if(i_regs->u&(1LL<<rt1[i])) t=-1;
2002       assert(sr>=0);
2003       emit_negc(s,t,sr);
2004     }
2005   }
2006 }
2007 
imm8_assemble(int i,struct regstat * i_regs)2008 void imm8_assemble(int i,struct regstat *i_regs)
2009 {
2010   if(opcode[i]==0x7) { // ADD
2011     signed char s,t;
2012     t=get_reg(i_regs->regmap,rt1[i]);
2013     s=get_reg(i_regs->regmap,rs1[i]);
2014     //assert(t>=0);
2015     assert(s>=0);
2016     if(t>=0) {
2017       if(!((i_regs->isdoingcp>>t)&1)) {
2018         if(s<0) {
2019           if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2020           emit_addimm(t,imm[i],t);
2021         }else{
2022           if(!((i_regs->wasdoingcp>>s)&1))
2023             emit_addimm(s,imm[i],t);
2024           else
2025             emit_movimm(cpmap[i][s]+imm[i],t);
2026         }
2027       }
2028     }
2029   }
2030   else if(opcode[i]==0x8) { // CMP/EQ
2031     signed char s,sr,temp;
2032     s=get_reg(i_regs->regmap,rs1[i]);
2033     sr=get_reg(i_regs->regmap,SR);
2034     temp=get_reg(i_regs->regmap,-1);
2035     assert(s>=0);
2036     assert(sr>=0); // Liveness analysis?
2037     assert(temp>=0);
2038     emit_cmpeqimm(s,imm[i],sr,temp);
2039   }
2040   else if(opcode[i]==12) {
2041     if(opcode2[i]==8) { // TST
2042       signed char s,sr,temp;
2043       s=get_reg(i_regs->regmap,rs1[i]);
2044       sr=get_reg(i_regs->regmap,SR);
2045       temp=get_reg(i_regs->regmap,-1);
2046       assert(s>=0);
2047       assert(sr>=0); // Liveness analysis?
2048       assert(temp>=0);
2049       emit_sh2tstimm(s,imm[i],sr,temp);
2050     }else{
2051       signed char s,t;
2052       t=get_reg(i_regs->regmap,rt1[i]);
2053       s=get_reg(i_regs->regmap,rs1[i]);
2054       if(t>=0 && !((i_regs->isdoingcp>>t)&1)) {
2055         if(opcode2[i]==9) //AND
2056         {
2057           if(s<0) {
2058             if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2059             emit_andimm(t,imm[i],t);
2060           }else{
2061             if(!((i_regs->wasdoingcp>>s)&1))
2062               emit_andimm(s,imm[i],t);
2063             else
2064               emit_movimm(cpmap[i][s]&imm[i],t);
2065           }
2066         }
2067         else
2068         if(opcode2[i]==10) //XOR
2069         {
2070           if(s<0) {
2071             if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2072             emit_xorimm(t,imm[i],t);
2073           }else{
2074             if(!((i_regs->wasdoingcp>>s)&1))
2075               emit_xorimm(s,imm[i],t);
2076             else
2077               emit_movimm(cpmap[i][s]^imm[i],t);
2078           }
2079         }
2080         else
2081         if(opcode2[i]==11) //OR
2082         {
2083           if(s<0) {
2084             if(i_regs->regmap_entry[t]!=rs1[i]) emit_loadreg(rs1[i],t);
2085             emit_orimm(t,imm[i],t);
2086           }else{
2087             if(!((i_regs->wasdoingcp>>s)&1))
2088               emit_orimm(s,imm[i],t);
2089             else
2090               emit_movimm(cpmap[i][s]|imm[i],t);
2091           }
2092         }
2093       }
2094     }
2095   }
2096   else { // opcode[i]==0xE
2097     signed char t;
2098     assert(opcode[i]==0xE);
2099     t=get_reg(i_regs->regmap,rt1[i]);
2100     //assert(t>=0);
2101     if(t>=0) {
2102       if(!((i_regs->isdoingcp>>t)&1))
2103         emit_movimm(imm[i]<<16,t);
2104     }
2105   }
2106 }
2107 
shiftimm_assemble(int i,struct regstat * i_regs)2108 void shiftimm_assemble(int i,struct regstat *i_regs)
2109 {
2110   if(opcode[i]==4) // SHL/SHR
2111   {
2112     if(opcode2[i]<8) {
2113       signed char s,t,sr;
2114       s=get_reg(i_regs->regmap,rs1[i]);
2115       t=get_reg(i_regs->regmap,rt1[i]);
2116       sr=get_reg(i_regs->regmap,SR);
2117       assert(s==t);
2118       if(opcode2[i]==0) // SHLL/SHAL
2119       {
2120         if(i_regs->u&(1LL<<TBIT)) emit_shlimm(s,1,s);
2121         else emit_shlsr(s,sr); // Is there any difference between SHLL and SHAL?
2122       }
2123       else if(opcode2[i]==1) // SHLR/SHAR
2124       {
2125         if(i_regs->u&(1LL<<TBIT)) {
2126           // Skip T bit if unneeded
2127           if(opcode3[i]==0) emit_shrimm(s,1,s);
2128           if(opcode3[i]==2) emit_sarimm(s,1,s);
2129         }else{
2130           // Set T bit
2131           if(opcode3[i]==0) emit_shrsr(s,sr);
2132           if(opcode3[i]==2) emit_sarsr(s,sr);
2133         }
2134       }
2135       else if(opcode2[i]==4) {// ROTL/ROTCL
2136         if(opcode3[i]==0) {
2137           if(i_regs->u&(1LL<<TBIT)) {
2138             emit_rotl(s); // Skip T bit if unneeded
2139           }else{
2140             emit_rotlsr(s,sr);
2141           }
2142         }
2143         if(opcode3[i]==2) emit_rotclsr(s,sr);
2144       }
2145       else {
2146         assert(opcode2[i]==5); // ROTR/ROTCR
2147         if(opcode3[i]==0) {
2148           if(i_regs->u&(1LL<<TBIT)) {
2149             emit_rotr(s); // Skip T bit if unneeded
2150           }else{
2151             emit_rotrsr(s,sr);
2152           }
2153         }
2154         if(opcode3[i]==2) emit_rotcrsr(s,sr);
2155       }
2156     }else{
2157       signed char s,t;
2158       s=get_reg(i_regs->regmap,rs1[i]);
2159       t=get_reg(i_regs->regmap,rt1[i]);
2160       //assert(t>=0);
2161       if(t>=0){
2162         if(opcode2[i]==8) // SHLL
2163         {
2164           if(opcode3[i]==0) emit_shlimm(s,2,t);
2165           if(opcode3[i]==1) emit_shlimm(s,8,t);
2166           if(opcode3[i]==2) emit_shlimm(s,16,t);
2167         }
2168         if(opcode2[i]==9) // SHLR
2169         {
2170           if(opcode3[i]==0) emit_shrimm(s,2,t);
2171           if(opcode3[i]==1) emit_shrimm(s,8,t);
2172           if(opcode3[i]==2) emit_shrimm(s,16,t);
2173         }
2174       }
2175     }
2176   }
2177   else if(opcode[i]==2) // XTRCT
2178   {
2179     signed char s,t,sr;
2180     s=get_reg(i_regs->regmap,rs1[i]);
2181     t=get_reg(i_regs->regmap,rt1[i]);
2182     assert(rs2[i]==rt1[i]);
2183     emit_shrdimm(t,s,16,t);
2184   }
2185 }
2186 
load_assemble(int i,struct regstat * i_regs)2187 void load_assemble(int i,struct regstat *i_regs)
2188 {
2189   int dummy;
2190   int s,o,t,addr,map=-1,cache=-1;
2191   int offset;
2192   int jaddr=0;
2193   int memtarget,c=0;
2194   int dualindex=(addrmode[i]==DUALIND||addrmode[i]==GBRIND);
2195   int size=(opcode[i]==4)?2:(opcode2[i]&3);
2196   unsigned int hr;
2197   u32 reglist=0;
2198   pointer constaddr;
2199   t=get_reg(i_regs->regmap,rt1[i]==TBIT?-1:rt1[i]);
2200   s=get_reg(i_regs->regmap,rs1[i]);
2201   o=get_reg(i_regs->regmap,rs2[i]);
2202   offset=imm[i];
2203   for(hr=0;hr<HOST_REGS;hr++) {
2204     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2205   }
2206   //if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2207   if(s>=0) {
2208     if(dualindex)
2209       c=(i_regs->wasdoingcp>>s)&(i_regs->wasdoingcp>>o)&1;
2210     else
2211       c=(i_regs->wasdoingcp>>s)&1;
2212     if(c) {
2213       if(dualindex)
2214         constaddr=cpmap[i][s]+cpmap[i][o];
2215       else
2216         constaddr=cpmap[i][s]+offset;
2217       //if(dualindex) {
2218       // if((i_regs->isconst>>rs1[i])&(i_regs->isconst>>rs2[i])&1)
2219       //  assert(constaddr==i_regs->constmap[rs1[i]]+i_regs->constmap[rs2[i]]);
2220       //}else
2221       // if((i_regs->isconst>>rs1[i])&1)
2222       //  assert(constaddr==i_regs->constmap[rs1[i]]+offset);
2223       if(addrmode[i]==POSTINC) constaddr-=1<<size;
2224       //printf("constaddr=%x offset=%x\n",constaddr,offset);
2225       memtarget=can_direct_read(constaddr);
2226     }
2227   }
2228   if(t<0) t=get_reg(i_regs->regmap,-1);
2229   if(!c) {
2230     if(dualindex) {
2231       c=(i_regs->isconst>>rs1[i])&(i_regs->isconst>>rs2[i])&1;
2232     } else {
2233       c=(i_regs->isconst>>rs1[i])&1;
2234     }
2235     if(c) {
2236       if(dualindex)
2237         constaddr=i_regs->constmap[rs1[i]]+i_regs->constmap[rs2[i]];
2238       else
2239         constaddr=i_regs->constmap[rs1[i]]+offset;
2240       if(addrmode[i]==POSTINC) constaddr-=1<<size;
2241       //printf("constaddr=%x offset=%x\n",constaddr,offset);
2242       memtarget=can_direct_read(constaddr);
2243       #ifndef HOST_IMM_ADDR32
2244       // In this case, the constant is not already loaded into a register
2245       if(can_direct_read(constaddr))
2246         emit_movimm(map_address(constaddr^(!size)),t);
2247       #endif
2248     }
2249   }
2250   if(offset||dualindex||s<0||c) addr=t;
2251   else addr=s;
2252   //printf("load_assemble: c=%d\n",c);
2253   //if(c) printf("load_assemble: const=%x\n",(int)constaddr);
2254   assert(t>=0); // Even if the load is a NOP, we must check for I/O
2255   reglist&=~(1<<t);
2256   if(!c)
2257   {
2258     int x=0;
2259     if (!c&&size==0) x=1; // MOV.B
2260     cache=get_reg(i_regs->regmap,MMREG);
2261     map=get_reg(i_regs->regmap,MOREG);
2262     if(map<0) map=get_alt_reg(i_regs->regmap,-1);
2263     assert(map>=0);
2264     assert(map!=s);
2265     assert(map!=t);
2266     reglist&=~(1<<map);
2267     map=do_map_r(addr,t,map,cache,x,-1,-1,c,constaddr);
2268     if (!c&&size==0) addr=t; // MOV.B
2269     do_map_r_branch(map,c,constaddr,&jaddr);
2270     //jaddr=(int)out;emit_jmp(0); // for debugging
2271   }
2272   else
2273   {
2274     if(can_direct_read(constaddr)) constaddr=map_address(constaddr);
2275   }
2276   dummy=(t!=get_reg(i_regs->regmap,rt1[i])); // ignore loads to unneeded reg
2277   if(opcode[i]==12&&opcode2[i]==12) // TST.B
2278     dummy=i_regs->u&(1LL<<TBIT);
2279   if (size==0) { // MOV.B
2280     if(!c||memtarget) {
2281       if(!dummy) {
2282         #ifdef HOST_IMM_ADDR32
2283         if(c)
2284           emit_movsbl(constaddr^1,t);
2285         else
2286         #endif
2287         {
2288           int x=0;
2289           emit_movsbl_indexed_map(x,t,map,t);
2290         }
2291       }
2292       if(jaddr)
2293         add_stub(LOADB_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2294     }
2295     else
2296       inline_readstub(LOADB_STUB,i,constaddr,i_regs->regmap,rt1[i],ccadj[i],reglist);
2297     if(rt1[i]==TBIT&&!dummy) { // TST.B
2298       signed char sr;
2299       sr=get_reg(i_regs->regmap,SR);
2300       assert(sr>=0); // Liveness analysis?
2301       emit_sh2tstimm(t,imm[i],sr,t);
2302     }
2303   }
2304   if (size==1) { // MOV.W
2305     if(!c||memtarget) {
2306       if(!dummy) {
2307         #ifdef HOST_IMM_ADDR32
2308         if(c)
2309           emit_movswl(constaddr,t);
2310         else
2311         #endif
2312         {
2313           int x=0;
2314           emit_movswl_indexed_map(0,addr,map,t);
2315         }
2316       }
2317       if(jaddr)
2318         add_stub(LOADW_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2319     }
2320     else
2321       inline_readstub(LOADW_STUB,i,constaddr,i_regs->regmap,rt1[i],ccadj[i],reglist);
2322   }
2323   if (size==2) { // MOV.L
2324     if(!c||memtarget) {
2325       if(!dummy) {
2326         #ifdef HOST_IMM_ADDR32
2327         if(c)
2328           emit_readword(constaddr,t);
2329         else
2330         #endif
2331         emit_readword_indexed_map(0,addr,map,t);
2332         emit_rorimm(t,16,t);
2333       }
2334       if(jaddr)
2335         add_stub(LOADL_STUB,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2336     }
2337     else
2338       inline_readstub(LOADL_STUB,i,constaddr,i_regs->regmap,rt1[i],ccadj[i],reglist);
2339   }
2340   if(addrmode[i]==POSTINC) {
2341     if(!((i_regs->wasdoingcp>>s)&1)) {
2342       if(!(i_regs->u&(1LL<<rt2[i]))&&rt1[i]!=rt2[i])
2343         emit_addimm(s,1<<size,s);
2344     }
2345   }
2346   //emit_storereg(rt1[i],tl); // DEBUG
2347   //if(opcode[i]==0x23)
2348   //if(opcode[i]==0x24)
2349   //if(opcode[i]==0x23||opcode[i]==0x24)
2350   /*if(opcode[i]==0x21||opcode[i]==0x23||opcode[i]==0x24)
2351   {
2352     //emit_pusha();
2353     save_regs(0x100f);
2354         emit_readword((int)&last_count,ECX);
2355         #ifdef __i386__
2356         if(get_reg(i_regs->regmap,CCREG)<0)
2357           emit_loadreg(CCREG,HOST_CCREG);
2358         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2359         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2360         emit_writeword(HOST_CCREG,(int)&Count);
2361         #endif
2362         #ifdef __arm__
2363         if(get_reg(i_regs->regmap,CCREG)<0)
2364           emit_loadreg(CCREG,0);
2365         else
2366           emit_mov(HOST_CCREG,0);
2367         emit_add(0,ECX,0);
2368         emit_addimm(0,2*ccadj[i],0);
2369         emit_writeword(0,(int)&Count);
2370         #endif
2371     emit_call((int)memdebug);
2372     //emit_popa();
2373     restore_regs(0x100f);
2374   }*/
2375 }
2376 
store_assemble(int i,struct regstat * i_regs)2377 void store_assemble(int i,struct regstat *i_regs)
2378 {
2379   int s,t,o,map=-1,cache=-1;
2380   int addr,temp;
2381   int offset;
2382   int jaddr=0,jaddr2,type;
2383   int memtarget,c=0,constaddr;
2384   int dualindex=(addrmode[i]==DUALIND);
2385   int size=(opcode[i]==4)?2:(opcode2[i]&3);
2386   int agr=AGEN1+(i&1);
2387   unsigned int hr;
2388   u32 reglist=0;
2389   t=get_reg(i_regs->regmap,rs1[i]);
2390   s=get_reg(i_regs->regmap,rs2[i]);
2391   o=get_reg(i_regs->regmap,rs3[i]);
2392   temp=get_reg(i_regs->regmap,agr);
2393   if(temp<0) temp=get_reg(i_regs->regmap,-1);
2394   offset=imm[i];
2395   for(hr=0;hr<HOST_REGS;hr++) {
2396     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2397   }
2398   //if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2399   if(s>=0) {
2400     if(dualindex)
2401       c=(i_regs->wasdoingcp>>s)&(i_regs->wasdoingcp>>o)&1;
2402     else
2403       c=(i_regs->wasdoingcp>>s)&1;
2404     if(c) {
2405       if(dualindex)
2406         constaddr=cpmap[i][s]+cpmap[i][o];
2407       else
2408         constaddr=cpmap[i][s]+offset;
2409     }
2410     //printf("constaddr=%x offset=%x\n",constaddr,offset);
2411     memtarget=can_direct_write(constaddr);
2412   }
2413   if(!c) {
2414     if(dualindex) {
2415       c=(i_regs->isconst>>rs2[i])&(i_regs->isconst>>rs3[i])&1;
2416     } else {
2417       c=(i_regs->isconst>>rs2[i])&1;
2418     }
2419     if(c) {
2420       if(dualindex)
2421         constaddr=i_regs->constmap[rs2[i]]+i_regs->constmap[rs3[i]];
2422       else
2423         constaddr=i_regs->constmap[rs2[i]]+offset;
2424       //printf("constaddr=%x offset=%x\n",constaddr,offset);
2425       memtarget=can_direct_write(constaddr);
2426       // In this case, the constant is not already loaded into a register
2427       if(can_direct_write(constaddr)) {
2428         emit_movimm(constaddr^(!size),temp);
2429         map=get_reg(i_regs->regmap,MOREG);
2430         if(map<0) map=get_alt_reg(i_regs->regmap,-1);
2431         generate_map_const(constaddr,map);
2432       }
2433     }
2434   }
2435   assert(t>=0);
2436   assert(temp>=0);
2437   if(offset||dualindex||s<0||c) addr=temp;
2438   else addr=s;
2439   //printf("store_assemble: c=%d\n",c);
2440   if(addrmode[i]==PREDEC&&!c&&rt1[i]==rs1[i]) addr=temp; // Old value is written, so decremented address is in a temporary register
2441   if(addrmode[i]==REGIND&&!c&&rs1[i]==rs2[i]) {// Swapped value is written, so unswapped value must be used as the address
2442     emit_mov(addr,temp);addr=temp;
2443   }
2444   if(!c||memtarget)
2445   {
2446     int x=0;
2447     if (!c&&size==0) x=1; // MOV.B
2448     cache=get_reg(i_regs->regmap,MMREG);
2449     map=get_reg(i_regs->regmap,MOREG);
2450     if(map<0) map=get_alt_reg(i_regs->regmap,-1);
2451     assert(map>=0);
2452     assert(map!=temp);
2453     assert(map!=s);
2454     reglist&=~(1<<map);
2455     //if(x) emit_xorimm(addr,x,temp); // for debugging
2456     map=do_map_w(addr,temp,map,cache,x,c,constaddr);
2457     if (!c&&size==0) addr=temp; // MOV.B
2458     do_map_w_branch(map,c,constaddr,&jaddr);
2459     //jaddr=(int)out;emit_jmp(0); // for debugging
2460   }
2461 
2462   if (size==0) { // MOV.B
2463     if(!c||memtarget) {
2464       int x=0;
2465       emit_writebyte_indexed_map(t,x,temp,map,temp);
2466     }
2467     type=STOREB_STUB;
2468   }
2469   if (size==1) { // MOV.W
2470     if(!c||memtarget) {
2471       emit_writehword_indexed_map(t,0,addr,map,temp);
2472     }
2473     type=STOREW_STUB;
2474   }
2475   if (size==2) { // MOV.L
2476     if(!c||memtarget) {
2477       emit_rorimm(t,16,t);
2478       emit_writeword_indexed_map(t,0,addr,map,temp);
2479       if(!(i_regs->u&(1LL<<rs1[i])))
2480         emit_rorimm(t,16,t);
2481     }
2482     type=STOREL_STUB;
2483   }
2484   if(jaddr) {
2485     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2486   } else if(c&&!memtarget) {
2487     inline_writestub(type,i,constaddr,i_regs->regmap,rs1[i],ccadj[i],reglist);
2488   }
2489   if(addrmode[i]==PREDEC) {
2490     assert(s>=0);
2491     if(!((i_regs->wasdoingcp>>s)&1)&&rt1[i]==rs1[i]) emit_addimm(s,-(1<<size),s); // Old value is written, so this "pre-decrement" is really post-decrement
2492   }
2493   //if(opcode[i]==0x2B || opcode[i]==0x3F)
2494   //if(opcode[i]==0x2B || opcode[i]==0x28)
2495   //if(opcode[i]==0x2B || opcode[i]==0x29)
2496   //if(opcode[i]==0x2B)
2497   /*if(opcode[i]==0x2B || opcode[i]==0x28 || opcode[i]==0x29 || opcode[i]==0x3F)
2498   {
2499     //emit_pusha();
2500     save_regs(0x100f);
2501         emit_readword((int)&last_count,ECX);
2502         #ifdef __i386__
2503         if(get_reg(i_regs->regmap,CCREG)<0)
2504           emit_loadreg(CCREG,HOST_CCREG);
2505         emit_add(HOST_CCREG,ECX,HOST_CCREG);
2506         emit_addimm(HOST_CCREG,2*ccadj[i],HOST_CCREG);
2507         emit_writeword(HOST_CCREG,(int)&Count);
2508         #endif
2509         #ifdef __arm__
2510         if(get_reg(i_regs->regmap,CCREG)<0)
2511           emit_loadreg(CCREG,0);
2512         else
2513           emit_mov(HOST_CCREG,0);
2514         emit_add(0,ECX,0);
2515         emit_addimm(0,2*ccadj[i],0);
2516         emit_writeword(0,(int)&Count);
2517         #endif
2518     emit_call((int)memdebug);
2519     //emit_popa();
2520     restore_regs(0x100f);
2521   }*/
2522 }
2523 
rmw_assemble(int i,struct regstat * i_regs)2524 void rmw_assemble(int i,struct regstat *i_regs)
2525 {
2526   int s,o,t,addr,map=-1,cache=-1;
2527   int jaddr=0;
2528   int type;
2529   int memtarget,c=0,constaddr;
2530   int dualindex=(addrmode[i]==GBRIND);
2531   unsigned int hr;
2532   u32 reglist=0;
2533   t=get_reg(i_regs->regmap,-1);
2534   s=get_reg(i_regs->regmap,rs1[i]);
2535   o=get_reg(i_regs->regmap,rs2[i]);
2536   for(hr=0;hr<HOST_REGS;hr++) {
2537     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2538   }
2539   if(s>=0) {
2540     if(dualindex)
2541       c=(i_regs->wasdoingcp>>s)&(i_regs->wasdoingcp>>o)&1;
2542     else
2543       c=(i_regs->wasdoingcp>>s)&1;
2544     if(c) {
2545       if(dualindex)
2546          constaddr=cpmap[i][s]+cpmap[i][o];
2547       else
2548          constaddr=cpmap[i][s];
2549     }
2550     //printf("constaddr=%x offset=%x\n",constaddr,offset);
2551     memtarget=1; // FIXME
2552   }
2553   if(dualindex||s<0||c) addr=t;
2554   else addr=s;
2555   assert(t>=0);
2556   reglist&=~(1<<t);
2557   {
2558     int x=0;
2559     if (!c) x=1; // MOV.B
2560     map=get_reg(i_regs->regmap,MOREG);
2561     cache=get_reg(i_regs->regmap,MMREG);
2562     assert(map>=0);
2563     reglist&=~(1<<map);
2564     map=do_map_w(addr,t,map,cache,x,c,constaddr);
2565     if (!c) addr=t; // MOV.B
2566     do_map_w_branch(map,c,constaddr,&jaddr);
2567   }
2568   if(opcode2[i]==11) type=RMWT_STUB; // TAS.B
2569   if(opcode2[i]==13) type=RMWA_STUB; // AND.B
2570   if(opcode2[i]==14) type=RMWX_STUB; // XOR.B
2571   if(opcode2[i]==15) type=RMWO_STUB; // OR.B
2572   if(!c||memtarget) {
2573     if(opcode2[i]==11) { // TAS.B
2574       signed char sr;
2575       sr=get_reg(i_regs->regmap,SR);
2576       assert(sr>=0); // Liveness analysis?
2577       assert(rt1[i]==TBIT);
2578       if(sr>=0&&!(i_regs->u&(1LL<<TBIT))) emit_sh2tas(addr,map,sr);
2579       else emit_rmw_orimm(addr,map,0x80); // T ignored, set only
2580     }
2581     if(opcode2[i]==13) emit_rmw_andimm(addr,map,imm[i]); // AND.B
2582     if(opcode2[i]==14) emit_rmw_xorimm(addr,map,imm[i]); // XOR.B
2583     if(opcode2[i]==15) emit_rmw_orimm(addr,map,imm[i]); // OR.B
2584   }
2585   if(jaddr)
2586     add_stub(type,jaddr,(int)out,i,addr,(int)i_regs,ccadj[i],reglist);
2587 }
2588 
pcrel_assemble(int i,struct regstat * i_regs)2589 void pcrel_assemble(int i,struct regstat *i_regs)
2590 {
2591   int t,addr,map=-1,cache=-1;
2592   int offset;
2593   int jaddr=0;
2594   int memtarget,c=0,constaddr;
2595   unsigned int hr;
2596   u32 reglist=0;
2597   t=get_reg(i_regs->regmap,rt1[i]);
2598   offset=imm[i];
2599   for(hr=0;hr<HOST_REGS;hr++) {
2600     if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
2601   }
2602   if(i_regs->regmap[HOST_CCREG]==CCREG) reglist&=~(1<<HOST_CCREG);
2603   if(t>=0) {
2604     if(!((i_regs->isdoingcp>>t)&1)) {
2605       int jaddr=0;
2606       // This is to handle the exceptional case where we can not do constant propagation
2607       assert(opcode[i]!=12); // MOVA should always be able to do constant propagation
2608       constaddr=((start+i*2+4)&~3)+imm[i];
2609       if(opcode[i]==9) constaddr=(start+i*2+4)+imm[i]; // MOV.W
2610       assem_debug("Can't do constant propagation, doing PC-relatve load\n");
2611       //int map=get_reg(i_regs->regmap,MOREG);
2612       //int cache=get_reg(i_regs->regmap,MMREG);
2613       //assert(map>=0);
2614       reglist&=~(1<<t);
2615       //reglist&=~(1<<map);
2616       assert(can_direct_read(constaddr));
2617       #ifndef HOST_IMM_ADDR32
2618       emit_movimm(map_address(constaddr),t);
2619       #endif
2620       //map=do_map_r(t,-1,map,cache,0,-1,-1,0,0);
2621       //do_map_r_branch(map,0,0,&jaddr);
2622       //assert(jaddr);
2623       if(opcode[i]==9) { // MOV.W
2624         // direct load
2625         #ifdef HOST_IMM_ADDR32
2626         emit_movswl(map_address(constaddr),t);
2627         #else
2628         //emit_movswl_indexed_map(0,t,map,t);
2629         emit_movswl_indexed(0,t,t);
2630         #endif
2631         //add_stub(LOADW_STUB,jaddr,(int)out,i,t,(int)(i_regs),ccadj[i],reglist);
2632       }
2633       else { // MOV.L
2634         // direct load
2635         #ifdef HOST_IMM_ADDR32
2636         emit_readword(map_address(constaddr),t);
2637         #else
2638         //emit_readword_indexed_map(0,t,map,t);
2639         emit_readword_indexed(0,t,t);
2640         #endif
2641         emit_rorimm(t,16,t);
2642         //add_stub(LOADL_STUB,jaddr,(int)out,i,t,(int)(i_regs),ccadj[i],reglist);
2643       }
2644     }
2645   }
2646 }
2647 
2648 //extern void debug_multiplication(int m,int n,int h,int l);
2649 #ifndef multdiv_assemble
multdiv_assemble(int i,struct regstat * i_regs)2650 void multdiv_assemble(int i,struct regstat *i_regs)
2651 {
2652   if(opcode[i]==0) {
2653     if(opcode2[i]==7) // MUL.L
2654     {
2655       int s1=get_reg(i_regs->regmap,rs1[i]);
2656       int s2=get_reg(i_regs->regmap,rs2[i]);
2657       int t=get_reg(i_regs->regmap,MACL);
2658       if(t>=0) emit_multiply(s1,s2,t);
2659     }
2660     if(opcode2[i]==8) // CLRMAC
2661     {
2662       int t1=get_reg(i_regs->regmap,rt1[i]);
2663       int t2=get_reg(i_regs->regmap,rt2[i]);
2664       if(!(i_regs->u&(1LL<<MACH)))
2665         emit_zeroreg(t1);
2666       if(!(i_regs->u&(1LL<<MACL)))
2667         emit_zeroreg(t2);
2668     }
2669     if(opcode2[i]==9) // DIV0U
2670     {
2671       int sr=get_reg(i_regs->regmap,SR);
2672       emit_andimm(sr,0xfe,sr);
2673     }
2674   }
2675   if(opcode[i]==2) {
2676     if(opcode2[i]==7) // DIV0S
2677     {
2678       int s1=get_reg(i_regs->regmap,rs1[i]);
2679       int s2=get_reg(i_regs->regmap,rs2[i]);
2680       int sr=get_reg(i_regs->regmap,SR);
2681       int temp=get_reg(i_regs->regmap,-1);
2682       assert(s1>=0);
2683       assert(s2>=0);
2684       assert(sr>=0);
2685       emit_div0s(s1,s2,sr,temp);
2686     }
2687     if(opcode2[i]==14||opcode2[i]==15) // MULU.W / MULS.W
2688     {
2689       int s1=get_reg(i_regs->regmap,rs1[i]);
2690       int s2=get_reg(i_regs->regmap,rs2[i]);
2691       int t=get_reg(i_regs->regmap,MACL);
2692       #ifdef HOST_TEMPREG
2693       int temp=HOST_TEMPREG;
2694       #else
2695       int temp=get_reg(i_regs->regmap,-1);
2696       #endif
2697       if(t>=0) {
2698         assert(temp>=0);
2699         if(opcode2[i]==14) { // MULU.W
2700           emit_movzwl_reg(s1,t);
2701           emit_movzwl_reg(s2,temp);
2702         }else{ // MULS.W
2703           emit_movswl_reg(s1,t);
2704           emit_movswl_reg(s2,temp);
2705         }
2706         emit_multiply(t,temp,t);
2707       }
2708       /* DEBUG
2709       emit_pusha();
2710       emit_pushreg(t);
2711       emit_pushreg(t);
2712       emit_pushreg(s2);
2713       emit_pushreg(s1);
2714       emit_call((int)debug_multiplication);
2715       emit_addimm(ESP,16,ESP);
2716       emit_popa();*/
2717     }
2718   }
2719   if(opcode[i]==3) {
2720     int s1=get_reg(i_regs->regmap,rs1[i]);
2721     int s2=get_reg(i_regs->regmap,rs2[i]);
2722     int th=get_reg(i_regs->regmap,MACH);
2723     int tl=get_reg(i_regs->regmap,MACL);
2724     if(th>=0) {
2725       // DMULU.L / DMULS.L
2726       #if defined(__i386__) || defined(__x86_64__)
2727       assert(tl==EAX);
2728       assert(th==EDX);
2729       assert(s1!=EAX); // This would work only if s1 is clean or dead
2730       if(s1!=EAX) emit_mov(s1,EAX);
2731       if(opcode2[i]==5) emit_mul(s2); // DMULU.L
2732       if(opcode2[i]==13) emit_imul(s2); // DMULS.L
2733       #else
2734       if(opcode2[i]==5) emit_umull(s1,s2,th,tl); // DMULU.L
2735       if(opcode2[i]==13) emit_smull(s1,s2,th,tl); // DMULS.L
2736       #endif
2737     }else if(tl>=0) {
2738       // MACH is unneeded, 32-bit result only
2739       emit_multiply(s1,s2,tl);
2740     }
2741     /* DEBUG
2742     emit_pusha();
2743     emit_pushreg(tl);
2744     emit_pushreg(th);
2745     emit_pushreg(s2);
2746     emit_pushreg(s1);
2747     emit_call((int)debug_multiplication);
2748     emit_addimm(ESP,16,ESP);
2749     emit_popa();*/
2750   }
2751 }
2752 #endif
2753 
mov_assemble(int i,struct regstat * i_regs)2754 void mov_assemble(int i,struct regstat *i_regs)
2755 {
2756   signed char s,t;
2757   t=get_reg(i_regs->regmap,rt1[i]);
2758   //assert(t>=0);
2759   if(t>=0) {
2760     s=get_reg(i_regs->regmap,rs1[i]);
2761     if(s>=0) {if(s!=t) emit_mov(s,t);}
2762     else emit_loadreg(rs1[i],t);
2763   }
2764 }
2765 
ext_assemble(int i,struct regstat * i_regs)2766 void ext_assemble(int i,struct regstat *i_regs)
2767 {
2768   signed char s,t;
2769   t=get_reg(i_regs->regmap,rt1[i]);
2770   //assert(t>=0);
2771   if(t>=0) {
2772     s=get_reg(i_regs->regmap,rs1[i]);
2773     if(s>=0) {
2774       if(opcode2[i]==12) emit_movzbl_reg(s,t);
2775       if(opcode2[i]==13) emit_movzwl_reg(s,t);
2776       if(opcode2[i]==14) emit_movsbl_reg(s,t);
2777       if(opcode2[i]==15) emit_movswl_reg(s,t);
2778     }
2779     else
2780     {
2781       emit_loadreg(rs1[i],t); // Fix - do byte/halfword loads?
2782       if(opcode2[i]==12) emit_movzbl_reg(t,t);
2783       if(opcode2[i]==13) emit_movzwl_reg(t,t);
2784       if(opcode2[i]==14) emit_movsbl_reg(t,t);
2785       if(opcode2[i]==15) emit_movswl_reg(t,t);
2786     }
2787   }
2788 }
2789 
flags_assemble(int i,struct regstat * i_regs)2790 void flags_assemble(int i,struct regstat *i_regs)
2791 {
2792   signed char sr,t;
2793   sr=get_reg(i_regs->regmap,SR);
2794   if(opcode2[i]==8) { // CLRT/SETT
2795     if(opcode3[i]==0) emit_andimm(sr,~1,sr);
2796     if(opcode3[i]==1) emit_orimm(sr,1,sr);
2797   }else
2798   if(opcode2[i]==9) { // MOVT
2799     t=get_reg(i_regs->regmap,rt1[i]);
2800     if(t>=0)
2801       emit_andimm(sr,1,t);
2802   }
2803 }
2804 
complex_assemble(int i,struct regstat * i_regs)2805 void complex_assemble(int i,struct regstat *i_regs)
2806 {
2807   if(opcode[i]==3&&opcode2[i]==4) { // DIV1
2808     emit_call((pointer)div1);
2809   }
2810   if(opcode[i]==0&&opcode2[i]==15) { // MAC.L
2811     load_regs(i_regs->regmap_entry,i_regs->regmap,MACL,MACH,MACH);
2812     // If both registers are the same, the register is incremented twice.
2813     // Pre-increment one of the function arguments.
2814     #if defined(__i386__) || defined(__x86_64__)
2815     if(rs1[i]==rs2[i]) {emit_mov(EDI,EBP);emit_addimm(EDI,4,EDI);}
2816     #else
2817     #if defined(__arm__)
2818     if(rs1[i]==rs2[i]) {emit_mov(6,5);emit_addimm(6,4,6);}
2819     #else
2820     // FIXME
2821     assert(0);
2822     #endif
2823     #endif
2824 /* DEBUG
2825   //if(i_regmap[HOST_CCREG]!=CCREG) {
2826     emit_loadreg(CCREG,ECX);
2827     emit_addimm(ECX,CLOCK_DIVIDER*(ccadj[i]),ECX);
2828     output_byte(0x03);
2829     output_modrm(1,4,ECX);
2830     output_sib(0,4,4);
2831     output_byte(4);
2832     emit_writeword(ECX,slave?(int)&SSH2->cycles:(int)&MSH2->cycles);
2833 //  }*/
2834     emit_call((pointer)macl);
2835   }
2836   if(opcode[i]==4&&opcode2[i]==15) { // MAC.W
2837     load_regs(i_regs->regmap_entry,i_regs->regmap,MACL,MACH,MACH);
2838     // If both registers are the same, the register is incremented twice.
2839     // Pre-increment one of the function arguments.
2840     #if defined(__i386__) || defined(__x86_64__)
2841     if(rs1[i]==rs2[i]) {emit_mov(EDI,EBP);emit_addimm(EDI,2,EDI);}
2842     #else
2843     #if defined(__arm__)
2844     if(rs1[i]==rs2[i]) {emit_mov(6,5);emit_addimm(6,2,6);}
2845     #else
2846     // FIXME
2847     assert(0);
2848     #endif
2849     #endif
2850 /* DEBUG
2851   //if(i_regmap[HOST_CCREG]!=CCREG) {
2852     emit_loadreg(CCREG,ECX);
2853     emit_addimm(ECX,CLOCK_DIVIDER*(ccadj[i]),ECX);
2854     output_byte(0x03);
2855     output_modrm(1,4,ECX);
2856     output_sib(0,4,4);
2857     output_byte(4);
2858     emit_writeword(ECX,slave?(int)&SSH2->cycles:(int)&MSH2->cycles);
2859 //  }*/
2860     emit_call((pointer)macw);
2861   }
2862 }
2863 
ds_assemble(int i,struct regstat * i_regs)2864 void ds_assemble(int i,struct regstat *i_regs)
2865 {
2866   is_delayslot=1;
2867   switch(itype[i]) {
2868     case ALU:
2869       alu_assemble(i,i_regs);break;
2870     case IMM8:
2871       imm8_assemble(i,i_regs);break;
2872     case SHIFTIMM:
2873       shiftimm_assemble(i,i_regs);break;
2874     case LOAD:
2875       load_assemble(i,i_regs);break;
2876     case STORE:
2877       store_assemble(i,i_regs);break;
2878     case RMW:
2879       rmw_assemble(i,i_regs);break;
2880     case PCREL:
2881       pcrel_assemble(i,i_regs);break;
2882     case MULTDIV:
2883       multdiv_assemble(i,i_regs);break;
2884     case MOV:
2885       mov_assemble(i,i_regs);break;
2886     case EXT:
2887       ext_assemble(i,i_regs);break;
2888     case FLAGS:
2889       flags_assemble(i,i_regs);break;
2890     case COMPLEX:
2891       complex_assemble(i,i_regs);break;
2892     case SYSTEM:
2893     case SYSCALL:
2894     case UJUMP:
2895     case RJUMP:
2896     case CJUMP:
2897     case SJUMP:
2898       printf("Jump in the delay slot.  This is probably a bug.\n");
2899   }
2900   is_delayslot=0;
2901 }
2902 
2903 // Is the branch target a valid internal jump?
internal_branch(int addr)2904 int internal_branch(int addr)
2905 {
2906   if(addr&1) return 0; // Indirect (register) jump
2907   if(addr>=start && addr<start+slen*2-2)
2908   {
2909     return 1;
2910   }
2911   return 0;
2912 }
2913 
2914 #ifndef wb_invalidate
// Reconcile the register mapping 'pre' with the mapping 'entry'.
//   pre    - host register map before the transition
//   entry  - host register map required afterwards
//   dirty  - bit hr set when host register hr is dirty
//   u      - bit r set when guest register r is unneeded
// Works in three passes over the host registers whose mapping changes:
// write back, move between host registers, then reload what could not be
// moved.
void wb_invalidate(signed char pre[],signed char entry[],u32 dirty, u64 u)
{
  int hr;
  // Pass 1: write back dirty, still-needed registers that are either
  // dropped or cannot simply be moved.
  for(hr=0;hr<HOST_REGS;hr++) {
    int nr;
    if(hr==EXCLUDE_REG) continue;
    if(pre[hr]==entry[hr]) continue;
    if(pre[hr]<0) continue;
    if(!((dirty>>hr)&1)) continue;
    if((u>>pre[hr])&1) continue;
    nr=get_reg(entry,pre[hr]);
    if(nr<0) {
      emit_storereg(pre[hr],hr);
    }
    else if(pre[nr]>=0&&get_reg(entry,pre[nr])>=0) {
      // Register move would overwrite another register, so write back
      emit_storereg(pre[hr],hr);
    }
  }
  // Pass 2: move from one register to another (no writeback)
  for(hr=0;hr<HOST_REGS;hr++) {
    int nr;
    if(hr==EXCLUDE_REG) continue;
    if(pre[hr]==entry[hr]) continue;
    if(pre[hr]<0||(pre[hr]&63)>=TEMPREG) continue;
    nr=get_reg(entry,pre[hr]);
    if(nr<0) continue;
    if(pre[nr]<0||get_reg(entry,pre[nr])<0) {
      emit_mov(hr,nr);
    }
  }
  // Pass 3: reload registers that couldn't be directly moved
  for(hr=0;hr<HOST_REGS;hr++) {
    int nr;
    if(hr==EXCLUDE_REG) continue;
    if(pre[hr]==entry[hr]) continue;
    if(pre[hr]<0||(pre[hr]&63)>=TEMPREG) continue;
    nr=get_reg(entry,pre[hr]);
    if(nr<0) continue;
    if(pre[nr]>=0&&get_reg(entry,pre[nr])>=0) {
      emit_loadreg(pre[hr],nr);
    }
  }
}
2972 #endif
2973 
2974 // Load the specified registers
2975 // This only loads the registers given as arguments because
2976 // we don't want to load things that will be overwritten
load_regs(signed char entry[],signed char regmap[],int rs1,int rs2,int rs3)2977 void load_regs(signed char entry[],signed char regmap[],int rs1,int rs2,int rs3)
2978 {
2979   int hr;
2980   if(rs1==TBIT) rs1=SR;
2981   if(rs2==TBIT) rs2=SR;
2982   if(rs3==TBIT) rs3=SR;
2983   // Load 32-bit regs
2984   for(hr=0;hr<HOST_REGS;hr++) {
2985     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
2986       if(entry[hr]!=regmap[hr]) {
2987         if(regmap[hr]==rs1||regmap[hr]==rs2||regmap[hr]==rs3)
2988         {
2989           emit_loadreg(regmap[hr],hr);
2990         }
2991       }
2992     }
2993   }
2994 }
2995 
2996 // Load registers prior to the start of a loop
2997 // so that they are not loaded within the loop
loop_preload(signed char pre[],signed char entry[])2998 static void loop_preload(signed char pre[],signed char entry[])
2999 {
3000   int hr;
3001   for(hr=0;hr<HOST_REGS;hr++) {
3002     if(hr!=EXCLUDE_REG) {
3003       if(pre[hr]!=entry[hr]) {
3004         if(entry[hr]>=0) {
3005           if(get_reg(pre,entry[hr])<0) {
3006             assem_debug("loop preload:\n");
3007             //printf("loop preload: %d\n",hr);
3008             if(entry[hr]<TEMPREG)
3009             {
3010               emit_loadreg(entry[hr],hr);
3011             }
3012           }
3013         }
3014       }
3015     }
3016   }
3017 }
3018 
// Generate address for load/store instruction
//
// Part 1: if instruction i is a LOAD, STORE or RMW, emit code that places
// its effective address in a host register.
// Part 2: if instruction i+1 is a LOAD, STORE or RMW whose address is a
// known constant, emit the constant (and, without HOST_IMM_ADDR32, the
// mapper entry for stores/RMW) ahead of time.
//   i      - index of the instruction within the block
//   i_regs - register state used to find host registers for instruction i
//   entry  - register map on entry, used to skip work that is already in
//            place; may be NULL (every use is guarded by !entry)
void address_generation(int i,struct regstat *i_regs,signed char entry[])
{
  if(itype[i]==LOAD||itype[i]==STORE||itype[i]==RMW) {
    int rs,ri;
    int rm;
    int ra;
    // The address/mapper pseudo-registers alternate with instruction parity
    int agr=AGEN1+(i&1);
    int mgr=MGEN1+(i&1);
    // ra: host register that receives the address
    if(itype[i]==LOAD) {
      // Use the load's destination register, or a temporary (-1) when there
      // is none or the destination is TBIT
      ra=get_reg(i_regs->regmap,rt1[i]);
      if(ra<0||rt1[i]==TBIT) ra=get_reg(i_regs->regmap,-1);
      assert(ra>=0);
    }
    if(itype[i]==STORE||itype[i]==RMW) {
      // Use the address-generation register, or a temporary (-1)
      ra=get_reg(i_regs->regmap,agr);
      if(ra<0) ra=get_reg(i_regs->regmap,-1);
      assert(ra>=0);
    }
    // rs/ri: host registers of the two address operands.  Stores take them
    // from rs2/rs3, loads and RMW from rs1/rs2.
    if(itype[i]==STORE) {
      rs=get_reg(i_regs->regmap,rs2[i]);
      ri=get_reg(i_regs->regmap,rs3[i]);
    }else{
      rs=get_reg(i_regs->regmap,rs1[i]);
      ri=get_reg(i_regs->regmap,rs2[i]);
    }
    // rm: host register for the mapper entry (MOREG, else an alternate temp)
    rm=get_reg(i_regs->regmap,MOREG);
    if(rm<0) rm=get_alt_reg(i_regs->regmap,-1);
    if(ra>=0) {
      int offset=imm[i];
      int c;
      u32 constaddr;
      // c: nonzero when the wasdoingcp bits mark the address operands as
      // constants; constaddr: the address computed from cpmap in that case.
      // NOTE(review): rs (and ri) can be negative here - the rs<0 case is
      // only handled further down - so the shift count and the cpmap index
      // may be out of range.  Confirm this is harmless on all hosts.
      if(addrmode[i]==DUALIND||addrmode[i]==GBRIND) {
        c=(i_regs->wasdoingcp>>rs)&(i_regs->wasdoingcp>>ri)&1;
        constaddr=cpmap[i][rs]+cpmap[i][ri];
      }else{
        c=(i_regs->wasdoingcp>>rs)&1;
        constaddr=cpmap[i][rs]+offset;
        // For post-increment, step back by the access size
        // (1<<2 when opcode[i]==4, otherwise 1<<(opcode2[i]&3))
        if(addrmode[i]==POSTINC) constaddr-=1<<((opcode[i]==4)?2:(opcode2[i]&3));
      }
      // Pre-decrement with a non-constant base: decrement the base register
      // in place, unless the instruction also writes that register, in which
      // case the decrement is folded into the offset instead.
      if(addrmode[i]==PREDEC&&!c) {
        if(rt1[i]!=rs1[i]) emit_addimm(rs,-(1<<((opcode[i]==4)?2:(opcode2[i]&3))),rs);
        else offset=-(1<<((opcode[i]==4)?2:(opcode2[i]&3)));
      }
      if(rs<0) {
        // Base operand is not in a host register: load it straight into ra,
        // unless 'entry' shows ra already holds it
        if(itype[i]==LOAD) {
          if(!entry||entry[ra]!=rs1[i])
            emit_loadreg(rs1[i],ra);
        }
        if(itype[i]==STORE) {
          if(!entry||entry[ra]!=rs2[i])
            emit_loadreg(rs2[i],ra);
        }
        //if(!entry||entry[ra]!=rs1[i])
        //  printf("poor load scheduling!\n");
      }
      else if(c) {
        // Stores to memory go thru the mapper to detect self-modifying
        // code, loads don't.
        if(rm>=0) {
          if(!entry||entry[rm]!=mgr) {
            if(itype[i]==STORE) {
              if(can_direct_write(constaddr))
                generate_map_const(constaddr,rm);
            }
            if(itype[i]==RMW) {
              generate_map_const(constaddr,rm);
            }
          }
        }
        // Applied when (opcode2[i]&3)==0 or for RMW
        if((opcode2[i]&3)==0||itype[i]==RMW) constaddr^=1; // byteswap for little-endian
        if(rs1[i]!=rt1[i]||itype[i]!=LOAD||addrmode[i]==DUALIND||addrmode[i]==GBRIND) {
          if(!entry||entry[ra]!=agr) {
            // With HOST_IMM_ADDR32 the constant is only materialized for RMW
            // and for stores that can_direct_write() accepts
            #ifdef HOST_IMM_ADDR32
            if(itype[i]==RMW || (itype[i]==STORE && can_direct_write(constaddr)))
            #endif
            {
              if(itype[i]==LOAD&&can_direct_read(constaddr))
                emit_movimm(map_address(constaddr),ra);
              else
                emit_movimm(constaddr,ra);
            }
          } // else did it in the previous cycle
        } // else load_consts already did it
      }
      if(!c) {
        // Non-constant address: add the index register or the immediate.
        // When rs<0 the base was loaded into ra above, so ra is the base.
        if(rs>=0) {
          if(addrmode[i]==DUALIND||addrmode[i]==GBRIND)
            emit_add(rs,ri,ra);
          else
            if(offset) emit_addimm(rs,offset,ra);
        }else{
          if(addrmode[i]==DUALIND||addrmode[i]==GBRIND)
            emit_add(ra,ri,ra);
          else
            if(offset) emit_addimm(ra,offset,ra);
        }
      }
    }
  }
  // Preload constants for next instruction
  if(itype[i+1]==LOAD||itype[i+1]==STORE||itype[i+1]==RMW) {
    int agr,ra,rm;
    #ifndef HOST_IMM_ADDR32
    // Mapper entry
    agr=MGEN1+((i+1)&1);
    rm=get_reg(i_regs->regmap,agr);
    if(rm>=0) {
      int rs,ri;
      // Operand registers are looked up in the NEXT instruction's register map
      if(itype[i+1]==STORE) {
        rs=get_reg(regs[i+1].regmap,rs2[i+1]);
        ri=get_reg(regs[i+1].regmap,rs3[i+1]);
      }else{
        rs=get_reg(regs[i+1].regmap,rs1[i+1]);
        ri=get_reg(regs[i+1].regmap,rs2[i+1]);
      }
      //int rm=get_reg(i_regs->regmap,MOREG);
      int offset=imm[i+1];
      int c;
      u32 constaddr;
      // Same constant-address computation as in part 1, for instruction i+1
      if(addrmode[i+1]==DUALIND||addrmode[i+1]==GBRIND) {
        c=(regs[i+1].wasdoingcp>>rs)&(regs[i+1].wasdoingcp>>ri)&1;
        constaddr=cpmap[i+1][rs]+cpmap[i+1][ri];
      }else{
        c=(regs[i+1].wasdoingcp>>rs)&1;
        constaddr=cpmap[i+1][rs]+offset;
        if(addrmode[i+1]==POSTINC) constaddr-=1<<((opcode[i+1]==4)?2:(opcode2[i+1]&3));
      }
      if((opcode2[i+1]&3)==0||itype[i+1]==RMW) constaddr^=1; // byteswap for little-endian
      if(c) {
        // Stores to memory go thru the mapper to detect self-modifying
        // code, loads don't.
        if(itype[i+1]==STORE) {
          if(can_direct_write(constaddr))
            generate_map_const(constaddr,rm);
        }
        if(itype[i+1]==RMW) {
          generate_map_const(constaddr,rm);
        }
      }
    }
    #endif
    // Actual address
    agr=AGEN1+((i+1)&1);
    ra=get_reg(i_regs->regmap,agr);
    if(ra>=0) {
      int c;
      int offset;
      int rs,ri;
      u32 constaddr;
      if(itype[i+1]==STORE) {
        rs=get_reg(regs[i+1].regmap,rs2[i+1]);
        ri=get_reg(regs[i+1].regmap,rs3[i+1]);
      }else{
        rs=get_reg(regs[i+1].regmap,rs1[i+1]);
        ri=get_reg(regs[i+1].regmap,rs2[i+1]);
      }
      offset=imm[i+1];
      if(addrmode[i+1]==DUALIND||addrmode[i+1]==GBRIND) {
        c=(regs[i+1].wasdoingcp>>rs)&(regs[i+1].wasdoingcp>>ri)&1;
        constaddr=cpmap[i+1][rs]+cpmap[i+1][ri];
      }else{
        c=(regs[i+1].wasdoingcp>>rs)&1;
        constaddr=cpmap[i+1][rs]+offset;
        if(addrmode[i+1]==POSTINC) constaddr-=1<<((opcode[i+1]==4)?2:(opcode2[i+1]&3));
      }
      if((opcode2[i+1]&3)==0||itype[i+1]==RMW) constaddr^=1; // byteswap for little-endian
      // Skipped for a load whose base is also its destination (unless the
      // mode is DUALIND/GBRIND); part 1 attributes that case to load_consts
      if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD||addrmode[i+1]==DUALIND||addrmode[i+1]==GBRIND)) {
      //if(c&&(rs1[i+1]!=rt1[i+1]||itype[i+1]!=LOAD)) {
        #ifdef HOST_IMM_ADDR32
        if(itype[i+1]==RMW || (itype[i+1]==STORE && can_direct_write(constaddr)))
        #endif
        {
          if(itype[i+1]==LOAD&&can_direct_read(constaddr))
            emit_movimm(map_address(constaddr),ra);
          else
            emit_movimm(constaddr,ra);
        }
      }
    }
  }
}
3201 
// Determine the constant that should be loaded into host register hr,
// looking ahead from instruction i.
//   hr    - host register index
//   i     - instruction index where the lookup starts
//   value - receives the constant to load
// Returns nonzero when the constant should be loaded, 0 when loading it can
// be skipped.  *value is written on every path except the HOST_IMM_ADDR32
// early returns.
int get_final_value(int hr, int i, int *value)
{
  int reg=regs[i].regmap[hr];
  // Advance i while the following instruction keeps the same guest register
  // in hr, still has the isdoingcp bit set for hr, and is not a branch target
  while(i<slen-1) {
    if(regs[i+1].regmap[hr]!=reg) break;
    if(!((regs[i+1].isdoingcp>>hr)&1)) break;
    if(bt[i+1]) break;
    i++;
  }
  if(i<slen-1) {
    // Stopped on a branch: always load the value recorded there
    if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
      *value=cpmap[i][hr];
      return 1;
    }
    if(!bt[i+1]) {
      if(itype[i+1]==UJUMP||itype[i+1]==RJUMP||itype[i+1]==SJUMP) {
        // Load in delay slot, out-of-order execution
        if(itype[i+2]==LOAD&&rs1[i+2]==reg&&rt1[i+2]==reg&&((regs[i+1].wasdoingcp>>hr)&1))
        {
          // For DUALIND/GBRIND the register's own constant is returned
          if(addrmode[i+2]==DUALIND||addrmode[i+2]==GBRIND) {
            *value=cpmap[i][hr];
            return 1;
          }
          // Don't load address if can_direct_read and HOST_IMM_ADDR32
          #ifdef HOST_IMM_ADDR32
          if(can_direct_read(cpmap[i][hr]+imm[i+2])) return 0;
          #endif
          // Precompute load address
          *value=cpmap[i][hr]+imm[i+2];
          if(can_direct_read(*value)) *value=map_address(*value);
          if((opcode2[i+2]&3)==0) *value^=1; // byteswap for little-endian
          return 1;
        }
      }
      // Next instruction is a load that uses this register as both its
      // base and its destination
      if(itype[i+1]==LOAD&&rs1[i+1]==reg&&rt1[i+1]==reg)
      {
        if(addrmode[i+1]==DUALIND||addrmode[i+1]==GBRIND) {
          *value=cpmap[i][hr];
          return 1;
        }
        // Don't load address if can_direct_read and HOST_IMM_ADDR32
        #ifdef HOST_IMM_ADDR32
        if(can_direct_read(cpmap[i][hr]+imm[i+1])) return 0;
        #endif
        // Precompute load address
        *value=cpmap[i][hr]+imm[i+1];
        if(can_direct_read(*value)) *value=map_address(*value);
        if((opcode2[i+1]&3)==0) *value^=1; // byteswap for little-endian
        //printf("c=%x imm=%x\n",(int)cpmap[i][hr],imm[i+1]);
        return 1;
      }
    }
  }
  // Default: the constant recorded at the instruction where the scan stopped
  *value=cpmap[i][hr];
  //printf("c=%x\n",(int)cpmap[i][hr]);
  // At the last instruction always load; otherwise load only if the guest
  // register is not marked unneeded at the following instruction
  if(i==slen-1) return 1;
  return !((unneeded_reg[i+1]>>reg)&1);
}
3260 
3261 // Load registers with known constants
load_consts(signed char pre[],signed char regmap[],int i)3262 void load_consts(signed char pre[],signed char regmap[],int i)
3263 {
3264   int hr;
3265   // Load 32-bit regs
3266   for(hr=0;hr<HOST_REGS;hr++) {
3267     if(hr!=EXCLUDE_REG&&regmap[hr]>=0) {
3268       if(i==0||!((regs[i-1].isdoingcp>>hr)&1)||pre[hr]!=regmap[hr]||bt[i]) {
3269         if(((regs[i].isdoingcp>>hr)&1)&&regmap[hr]<64&&regmap[hr]>=0) {
3270           int value;
3271           if(get_final_value(hr,i,&value)) {
3272             emit_movimm(value,hr);
3273           }
3274         }
3275       }
3276     }
3277   }
3278 }
// Load every dirty host register whose isdoingcp bit is set at instruction
// i with the constant recorded in cpmap[i].
//   regmap - register map of instruction i
//   dirty  - bit hr set when host register hr is dirty
//   i      - instruction index
void load_all_consts(signed char regmap[],u32 dirty,int i)
{
  int hr;
  // Load 32-bit regs
  for(hr=0;hr<HOST_REGS;hr++) {
    int value;
    if(hr==EXCLUDE_REG) continue;
    if(regmap[hr]<0||regmap[hr]>=64) continue;
    if(!((dirty>>hr)&1)) continue;
    if(!((regs[i].isdoingcp>>hr)&1)) continue;
    value=cpmap[i][hr];
    emit_movimm(value,hr);
  }
}
3292 
3293 // Write out all dirty registers (except cycle count)
wb_dirtys(signed char i_regmap[],u32 i_dirty)3294 void wb_dirtys(signed char i_regmap[],u32 i_dirty)
3295 {
3296   int hr;
3297   for(hr=0;hr<HOST_REGS;hr++) {
3298     if(hr!=EXCLUDE_REG) {
3299       if(i_regmap[hr]>=0) {
3300         if(i_regmap[hr]!=CCREG) {
3301           if((i_dirty>>hr)&1) {
3302             emit_storereg(i_regmap[hr],hr);
3303           }
3304         }
3305       }
3306     }
3307   }
3308 }
3309 // Write out dirty registers that we need to reload (pair with load_needed_regs)
3310 // This writes the registers not written by store_regs_bt
wb_needed_dirtys(signed char i_regmap[],u32 i_dirty,int addr)3311 void wb_needed_dirtys(signed char i_regmap[],u32 i_dirty,int addr)
3312 {
3313   int hr;
3314   int t=(addr-start)>>1;
3315   for(hr=0;hr<HOST_REGS;hr++) {
3316     if(hr!=EXCLUDE_REG) {
3317       if(i_regmap[hr]>=0) {
3318         if(i_regmap[hr]!=CCREG) {
3319           if((i_regmap[hr]==regs[t].regmap_entry[hr] && ((regs[t].dirty>>hr)&1)) || i_regmap[hr]==SR || i_regmap[hr]==15) {
3320             if((i_dirty>>hr)&1) {
3321               emit_storereg(i_regmap[hr],hr);
3322             }
3323           }
3324         }
3325       }
3326     }
3327   }
3328 }
3329 
3330 // Load all registers (except cycle count)
load_all_regs(signed char i_regmap[])3331 void load_all_regs(signed char i_regmap[])
3332 {
3333   int hr;
3334   for(hr=0;hr<HOST_REGS;hr++) {
3335     if(hr!=EXCLUDE_REG) {
3336       if(i_regmap[hr]>=0 && i_regmap[hr]<TEMPREG && i_regmap[hr]!=CCREG)
3337       {
3338         emit_loadreg(i_regmap[hr],hr);
3339       }
3340     }
3341   }
3342 }
3343 
3344 // Load all current registers also needed by next instruction
load_needed_regs(signed char i_regmap[],signed char next_regmap[])3345 void load_needed_regs(signed char i_regmap[],signed char next_regmap[])
3346 {
3347   int hr;
3348   for(hr=0;hr<HOST_REGS;hr++) {
3349     if(hr!=EXCLUDE_REG) {
3350       if(get_reg(next_regmap,i_regmap[hr])>=0) {
3351         if(i_regmap[hr]>=0 && i_regmap[hr]<TEMPREG && i_regmap[hr]!=CCREG)
3352         {
3353           emit_loadreg(i_regmap[hr],hr);
3354         }
3355       }
3356     }
3357   }
3358 }
3359 
3360 // Load all regs, storing cycle count if necessary
load_regs_entry(int t)3361 void load_regs_entry(int t)
3362 {
3363   int hr;
3364   if(is_ds[t]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER,HOST_CCREG);
3365   else if(ccadj[t]) emit_addimm(HOST_CCREG,-ccadj[t]*CLOCK_DIVIDER,HOST_CCREG);
3366   if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
3367     emit_storereg(CCREG,HOST_CCREG);
3368   }
3369   // Load 32-bit regs
3370   for(hr=0;hr<HOST_REGS;hr++) {
3371     if(regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
3372       if(regs[t].regmap_entry[hr]!=CCREG)
3373       {
3374         emit_loadreg(regs[t].regmap_entry[hr],hr);
3375       }
3376     }
3377   }
3378 }
3379 
3380 // Store dirty registers prior to branch
store_regs_bt(signed char i_regmap[],u32 i_dirty,int addr)3381 void store_regs_bt(signed char i_regmap[],u32 i_dirty,int addr)
3382 {
3383   if(internal_branch(addr))
3384   {
3385     int t=(addr-start)>>1;
3386     int hr;
3387     for(hr=0;hr<HOST_REGS;hr++) {
3388       if(hr!=EXCLUDE_REG) {
3389         if(i_regmap[hr]>=0 && i_regmap[hr]!=CCREG) {
3390           if(i_regmap[hr]!=regs[t].regmap_entry[hr] || !((regs[t].dirty>>hr)&1) ) {
3391             if((i_dirty>>hr)&1) {
3392               if(!((unneeded_reg[t]>>i_regmap[hr])&1)) {
3393                 emit_storereg(i_regmap[hr],hr);
3394               }
3395             }
3396           }
3397         }
3398       }
3399     }
3400   }
3401   else
3402   {
3403     // Branch out of this block, write out all dirty regs
3404     wb_dirtys(i_regmap,i_dirty);
3405   }
3406 }
3407 
3408 // Load all needed registers for branch target
load_regs_bt(signed char i_regmap[],u32 i_dirty,int addr)3409 void load_regs_bt(signed char i_regmap[],u32 i_dirty,int addr)
3410 {
3411   //if(addr>=start && addr<(start+slen*4))
3412   if(internal_branch(addr))
3413   {
3414     int t=(addr-start)>>1;
3415     int hr;
3416     // Store the cycle count before loading something else
3417     if(i_regmap[HOST_CCREG]!=CCREG) {
3418       assert(i_regmap[HOST_CCREG]==-1);
3419     }
3420     if(regs[t].regmap_entry[HOST_CCREG]!=CCREG) {
3421       emit_storereg(CCREG,HOST_CCREG);
3422     }
3423     // Load 32-bit regs
3424     for(hr=0;hr<HOST_REGS;hr++) {
3425       if(hr!=EXCLUDE_REG&&regs[t].regmap_entry[hr]>=0&&regs[t].regmap_entry[hr]<TEMPREG) {
3426         if(i_regmap[hr]!=regs[t].regmap_entry[hr] ) {
3427           if(regs[t].regmap_entry[hr]!=CCREG)
3428           {
3429             emit_loadreg(regs[t].regmap_entry[hr],hr);
3430           }
3431         }
3432       }
3433     }
3434   }
3435 }
3436 
// Check whether the current register state is compatible with the branch
// target at addr.
//   i_regmap - current host register map
//   i_dirty  - bit hr set when host register hr is dirty
//   addr     - branch target address
// Returns 1 on a match, 0 otherwise.  For an address outside
// [start, start+slen*2-2) the state matches only if no register other than
// the cycle count in HOST_CCREG is dirty.
int match_bt(signed char i_regmap[],u32 i_dirty,int addr)
{
  int hr;
  if(addr>=start && addr<start+slen*2-2)
  {
    int t=(addr-start)>>1;
    signed char *target_map=regs[t].regmap_entry;
    if(target_map[HOST_CCREG]!=CCREG) return 0;
    for(hr=0;hr<HOST_REGS;hr++)
    {
      int dirty_here;
      if(hr==EXCLUDE_REG) continue;
      dirty_here=(i_dirty>>hr)&1;
      if(i_regmap[hr]!=target_map[hr])
      {
        // Target wants a different guest register loaded here
        if(target_map[hr]>=0&&target_map[hr]<TEMPREG) return 0;
        // Our dirty value would have to be written back first
        if(dirty_here&&!((unneeded_reg[t]>>i_regmap[hr])&1)) return 0;
      }
      else if(i_regmap[hr]>=0)
      {
        // Same register but is it dirty?
        if(!((regs[t].dirty>>hr)&1)&&dirty_here&&!((unneeded_reg[t]>>i_regmap[hr])&1))
        {
          //printf("%x: dirty no match\n",addr);
          return 0;
        }
      }
    }
    // Delay slots require additional processing, so do not match
    if(is_ds[t]) return 0;
    return 1;
  }
  for(hr=0;hr<HOST_REGS;hr++)
  {
    if(hr==EXCLUDE_REG) continue;
    if(i_regmap[hr]<0) continue;
    if(hr==HOST_CCREG&&i_regmap[hr]==CCREG) continue;
    if((i_dirty>>hr)&1) return 0;
  }
  return 1;
}
3503 
3504 // Used when a branch jumps into the delay slot of another branch
ds_assemble_entry(int i)3505 void ds_assemble_entry(int i)
3506 {
3507   int t=(ba[i]-start)>>1;
3508   if(!instr_addr[t]) instr_addr[t]=(pointer)out;
3509   assem_debug("Assemble delay slot at %x\n",ba[i]);
3510   assem_debug("<->\n");
3511   if(regs[t].regmap_entry[HOST_CCREG]==CCREG&&regs[t].regmap[HOST_CCREG]!=CCREG)
3512     wb_register(CCREG,regs[t].regmap_entry,regs[t].wasdirty);
3513   load_regs(regs[t].regmap_entry,regs[t].regmap,rs1[t],rs2[t],rs3[t]);
3514   address_generation(t,&regs[t],regs[t].regmap_entry);
3515   if(itype[t]==LOAD||itype[t]==STORE)
3516     load_regs(regs[t].regmap_entry,regs[t].regmap,MMREG,MMREG,MMREG);
3517   is_delayslot=0;
3518   switch(itype[t]) {
3519     case ALU:
3520       alu_assemble(t,&regs[t]);break;
3521     case IMM8:
3522       imm8_assemble(t,&regs[t]);break;
3523     case SHIFTIMM:
3524       shiftimm_assemble(t,&regs[t]);break;
3525     case LOAD:
3526       load_assemble(t,&regs[t]);break;
3527     case STORE:
3528       store_assemble(t,&regs[t]);break;
3529     case RMW:
3530       rmw_assemble(t,&regs[t]);break;
3531     case PCREL:
3532       pcrel_assemble(t,&regs[t]);break;
3533     case MULTDIV:
3534       multdiv_assemble(t,&regs[t]);break;
3535     case MOV:
3536       mov_assemble(t,&regs[t]);break;
3537     case EXT:
3538       ext_assemble(i,&regs[t]);break;
3539     case FLAGS:
3540       flags_assemble(i,&regs[t]);break;
3541     case COMPLEX:
3542       complex_assemble(i,&regs[t]);break;
3543     case SYSTEM:
3544     case SYSCALL:
3545     case UJUMP:
3546     case RJUMP:
3547     case CJUMP:
3548     case SJUMP:
3549       printf("Jump in the delay slot.  This is probably a bug.\n");
3550   }
3551   store_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+2);
3552   load_regs_bt(regs[t].regmap,regs[t].dirty,ba[i]+2);
3553   if(internal_branch(ba[i]+2))
3554     assem_debug("branch: internal\n");
3555   else
3556     assem_debug("branch: external\n");
3557   assert(internal_branch(ba[i]+2));
3558   add_to_linker((int)out,ba[i]+2,internal_branch(ba[i]+2));
3559   emit_jmp(0);
3560 }
3561 
do_cc(int i,signed char i_regmap[],int * adj,int addr,int taken,int invert)3562 void do_cc(int i,signed char i_regmap[],int *adj,int addr,int taken,int invert)
3563 {
3564   int count;
3565   int jaddr;
3566   int idle=0;
3567   if(itype[i]==RJUMP)
3568   {
3569     *adj=0;
3570   }
3571   //if(ba[i]>=start && ba[i]<(start+slen*4))
3572   if(internal_branch(ba[i]))
3573   {
3574     int t=(ba[i]-start)>>1;
3575     if(is_ds[t]) *adj=ccadj[t+1]-cycles[t]; // Branch into delay slot adds an extra cycle
3576     else *adj=ccadj[t];
3577   }
3578   else
3579   {
3580     *adj=0;
3581   }
3582   if(itype[i]==CJUMP) *adj-=2+cycles[i]; // Two extra cycles for taken BT/BF
3583   if(itype[i]==SJUMP) *adj-=1+cycles[i]+cycles[i+1]; // One extra cycle for taken BT/BF with delay slot
3584   count=ccadj[i]+((taken==NODS)?0:cycles[i]+cycles[i+1]);
3585   if(taken==TAKEN && i==(ba[i]-start)>>1 && source[i+1]==0) {
3586     // Idle loop
3587     // FIXME
3588     //if(count&1) emit_addimm_and_set_flags(2*(count+2),HOST_CCREG);
3589     idle=(int)out;
3590     //emit_subfrommem(&idlecount,HOST_CCREG); // Count idle cycles
3591     emit_andimm(HOST_CCREG,3,HOST_CCREG);
3592     jaddr=(int)out;
3593     emit_jmp(0);
3594   }
3595   else if(*adj==0||invert) {
3596     emit_addimm_and_set_flags(CLOCK_DIVIDER*count,HOST_CCREG);
3597     jaddr=(int)out;
3598     emit_jns(0);
3599   }
3600   else
3601   {
3602     emit_cmpimm(HOST_CCREG,-CLOCK_DIVIDER*count);
3603     jaddr=(int)out;
3604     emit_jns(0);
3605   }
3606   add_stub(CC_STUB,jaddr,idle?idle:(int)out,(*adj==0||invert||idle)?0:count,i,addr,taken,0);
3607 }
3608 
// Emit the out-of-line code for cycle-count stub n.
// The stub writes back dirty registers, saves the PC, calls the cycle-count
// handler, reloads registers and jumps back to the main code.
// Fields of stubs[n] as used here:
//   [1] location of the jump that is patched to point at this stub
//   [2] address to jump back to at the end
//   [3] cycle adjustment, applied around the handler call when nonzero
//   [4] instruction index i
//   [5] PC to save, or -1 when the PC comes from the branch register
//   [6] NODS, TAKEN, NOTTAKEN or another value
void do_ccstub(int n)
{
  int i;
  literal_pool(256);
  assem_debug("do_ccstub %x\n",start+stubs[n][4]*2);
  set_jump_target(stubs[n][1],(pointer)out);
  i=stubs[n][4];
  // Step 1: write back dirty registers
  if(stubs[n][6]==NODS) {
    // If the next instruction is a load whose base is also its destination
    // (not DUALIND/GBRIND) and that register held a constant, put the plain
    // constant back before writing registers out
    if(itype[i+1]==LOAD&&rs1[i+1]==rt1[i+1]&&addrmode[i+1]!=DUALIND&&addrmode[i+1]!=GBRIND) {
      int hr=get_reg(regs[i].regmap,rs1[i+1]);
      if(hr>=0&&((regs[i].wasdoingcp>>hr)&1))
      {
        emit_movimm(cpmap[i][hr],hr);
      }
    }
    wb_dirtys(regs[i].regmap_entry,regs[i].dirty);
  }
  else if(stubs[n][6]!=TAKEN) {
    wb_dirtys(branch_regs[i].regmap,branch_regs[i].dirty);
  }
  else {
    // Taken branch: only an internal target gets a (partial) writeback here
    if(internal_branch(ba[i]))
      wb_needed_dirtys(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
  }
  // Step 2: store the PC into slave_pc or master_pc
  if(stubs[n][5]!=-1)
  {
    // Save PC as return address
    emit_movimm(stubs[n][5],0);
    emit_writeword(0,slave?(int)&slave_pc:(int)&master_pc);
  }
  else
  {
    // Return address is branch target
    if(itype[i]==RJUMP)
    {
      int r=get_reg(branch_regs[i].regmap,rs1[i]);
      // The target is taken from RTEMP instead when the delay slot writes
      // the branch register, and for BSRF/BRAF and RTE
      if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
        r=get_reg(branch_regs[i].regmap,RTEMP);
      }
      else if(opcode[i]==0&&opcode2[i]==3) {  // BSRF/BRAF
        r=get_reg(branch_regs[i].regmap,RTEMP);
      }
      else if(opcode[i]==0&&opcode2[i]==11&&opcode3[i]==2) {  // RTE
        r=get_reg(branch_regs[i].regmap,RTEMP);
      }
      emit_writeword(r,slave?(int)&slave_pc:(int)&master_pc);
    }
    else {printf("Unknown branch type in do_ccstub\n");exit(1);}
  }
  // Update cycle count
  if(stubs[n][6]==NODS) assert(regs[i].regmap[HOST_CCREG]==CCREG||regs[i].regmap[HOST_CCREG]==-1);
  else assert(branch_regs[i].regmap[HOST_CCREG]==CCREG||branch_regs[i].regmap[HOST_CCREG]==-1);
  if(stubs[n][3]) emit_addimm(HOST_CCREG,CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
  // Step 3: transfer control to the handler.  The slave path loads a return
  // address and jumps to cc_interrupt; the master path calls slave_entry.
  if(slave) {
    emit_load_return_address(SLAVERA_REG);
    emit_jmp((pointer)cc_interrupt);
  }
  else {
    emit_call((pointer)slave_entry);
  }
  // Undo the cycle adjustment (the NODS case handles this itself below)
  if(stubs[n][3]&&stubs[n][6]!=NODS) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
  // Step 4: reload the registers the continuation expects
  if(stubs[n][6]==TAKEN) {
    if(internal_branch(ba[i]))
      load_needed_regs(branch_regs[i].regmap,regs[(ba[i]-start)>>1].regmap_entry);
    else if(itype[i]==RJUMP) {
      // Restore the branch target: from the saved PC if RTEMP is mapped,
      // otherwise by reloading the branch register
      if(get_reg(branch_regs[i].regmap,RTEMP)>=0)
        emit_readword(slave?(int)&slave_pc:(int)&master_pc,get_reg(branch_regs[i].regmap,RTEMP));
      else
        emit_loadreg(rs1[i],get_reg(branch_regs[i].regmap,rs1[i]));
    }
  }else if(stubs[n][6]==NOTTAKEN) {
    if(i<slen-2) load_needed_regs(branch_regs[i].regmap,regmap_pre[i+2]);
    else load_all_regs(branch_regs[i].regmap);
  }else{
    if(stubs[n][6]==NODS) {
      // At a branch target or the first instruction only the return point
      // is recorded in ccstub_return[i]; otherwise registers and constants
      // are reloaded here and ccstub_return[i] is cleared
      if(bt[i]||i==0) ccstub_return[i]=(pointer)out;
      else {
        if(stubs[n][3]) emit_addimm(HOST_CCREG,-CLOCK_DIVIDER*stubs[n][3],HOST_CCREG);
        load_all_regs(regs[i].regmap);
        load_consts(regmap_pre[i],regs[i].regmap,i);
        // Re-create the precomputed load address that was replaced by the
        // plain constant in step 1
        if(itype[i+1]==LOAD&&rs1[i+1]==rt1[i+1]&&addrmode[i+1]!=DUALIND&&addrmode[i+1]!=GBRIND) {
          int hr=get_reg(regs[i].regmap,rs1[i+1]);
          if(hr>=0&&((regs[i].wasdoingcp>>hr)&1))
          {
            #ifdef HOST_IMM_ADDR32
            if(!can_direct_read(cpmap[i][hr]+imm[i+1]))
            #endif
            {
              int value=cpmap[i][hr]+imm[i+1];
              if(can_direct_read(value)) value=map_address(value);
              if((opcode2[i+1]&3)==0) value^=1; // byteswap for little-endian
              emit_movimm(value,hr);
            }
          }
        }
        ccstub_return[i]=0;
      }
    }
    else load_all_regs(branch_regs[i].regmap);
  }
  emit_jmp(stubs[n][2]); // return address
}
3711 
add_to_linker(int addr,int target,int ext)3712 void add_to_linker(int addr,int target,int ext)
3713 {
3714   link_addr[linkcount][0]=addr;
3715   link_addr[linkcount][1]=target|slave;
3716   link_addr[linkcount][2]=ext;
3717   linkcount++;
3718 }
3719 
ujump_assemble(int i,struct regstat * i_regs)3720 void ujump_assemble(int i,struct regstat *i_regs)
3721 {
3722   u64 bc_unneeded;
3723   int cc,adj;
3724   signed char *i_regmap=i_regs->regmap;
3725   if(i==(ba[i]-start)>>1) assem_debug("idle loop\n");
3726   address_generation(i+1,i_regs,regs[i].regmap_entry);
3727   #ifdef REG_PREFETCH
3728   int temp=get_reg(branch_regs[i].regmap,PTEMP);
3729   if(rt1[i]==PR&&temp>=0)
3730   {
3731     int return_address=start+i*2+4;
3732     if(get_reg(branch_regs[i].regmap,PR)>0)
3733     if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
3734   }
3735   #endif
3736   if(rt1[i]==PR) {
3737     if(rt1[i+1]==PR||rt2[i+1]==PR) {
3738       // Delay slot abuse, set PR before executing delay slot
3739       int rt;
3740       unsigned int return_address;
3741       rt=get_reg(regs[i].regmap,PR);
3742       return_address=start+i*2+4;
3743       assert(rt>=0);
3744       if(rt>=0) {
3745         #ifdef REG_PREFETCH
3746         if(temp>=0)
3747         {
3748           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
3749         }
3750         #endif
3751         emit_movimm(return_address,rt); // PC into link register
3752       }
3753     }
3754   }
3755   ds_assemble(i+1,i_regs);
3756   bc_unneeded=regs[i].u;
3757   bc_unneeded|=1LL<<rt1[i];
3758   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,
3759                 bc_unneeded);
3760   load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG,CCREG);
3761   if(rt1[i]==PR) {
3762     int rt;
3763     unsigned int return_address;
3764     assert(rs1[i+1]!=PR);
3765     assert(rs2[i+1]!=PR);
3766     assert(rs3[i+1]!=PR);
3767     rt=get_reg(branch_regs[i].regmap,PR);
3768     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
3769     //assert(rt>=0);
3770     return_address=start+i*2+4;
3771     if(rt>=0&&rt1[i+1]!=PR&&rt2[i+1]!=PR) {
3772       #ifdef USE_MINI_HT
3773       if(internal_branch(return_address)) {
3774         int temp=rt+1;
3775         if(temp==EXCLUDE_REG||temp>=HOST_REGS||
3776            branch_regs[i].regmap[temp]>=0)
3777         {
3778           temp=get_reg(branch_regs[i].regmap,-1);
3779         }
3780         #ifdef HOST_TEMPREG
3781         if(temp<0) temp=HOST_TEMPREG;
3782         #endif
3783         if(temp>=0) do_miniht_insert(return_address,rt,temp);
3784         else emit_movimm(return_address,rt);
3785       }
3786       else
3787       #endif
3788       {
3789         #ifdef REG_PREFETCH
3790         if(temp>=0)
3791         {
3792           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
3793         }
3794         #endif
3795         emit_movimm(return_address,rt); // PC into link register
3796         #ifdef IMM_PREFETCH
3797         emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
3798         #endif
3799       }
3800     }
3801   }
3802   cc=get_reg(branch_regs[i].regmap,CCREG);
3803   assert(cc==HOST_CCREG);
3804   store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
3805   #ifdef REG_PREFETCH
3806   if(rt1[i]==PR&&temp>=0) emit_prefetchreg(temp);
3807   #endif
3808   do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
3809   if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+cycles[i]+cycles[i+1]-adj),cc);
3810   load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
3811   if(internal_branch(ba[i]))
3812     assem_debug("branch: internal\n");
3813   else
3814     assem_debug("branch: external\n");
3815   if(internal_branch(ba[i])&&is_ds[(ba[i]-start)>>1]) {
3816     ds_assemble_entry(i);
3817   }
3818   else {
3819     add_to_linker((int)out,ba[i],internal_branch(ba[i]));
3820     emit_jmp(0);
3821   }
3822 }
3823 
rjump_assemble(int i,struct regstat * i_regs)3824 void rjump_assemble(int i,struct regstat *i_regs)
3825 {
3826   signed char *i_regmap=i_regs->regmap;
3827   int temp;
3828   int rs,cc,adj,rh,ht;
3829   u64 bc_unneeded;
3830   rs=get_reg(branch_regs[i].regmap,rs1[i]);
3831   assert(rs>=0);
3832   if(!((i_regs->wasdoingcp>>rs)&1)) {
3833     if(opcode[i]==0&&opcode2[i]==3) {
3834       // PC-relative branch, put PC in a temporary register
3835       temp=get_reg(branch_regs[i].regmap,RTEMP);
3836       assert(temp>=0);
3837       if(regs[i].regmap[temp]==RTEMP)
3838         emit_movimm(start+i*2+4,temp);
3839     }
3840     if(rs1[i]==rt1[i+1]||rs1[i]==rt2[i+1]) {
3841       // Delay slot abuse, make a copy of the branch address register
3842       temp=get_reg(branch_regs[i].regmap,RTEMP);
3843       assert(temp>=0);
3844       assert(regs[i].regmap[temp]==RTEMP);
3845       if(opcode[i]==0&&opcode2[i]==3)
3846         emit_add(rs,temp,temp);
3847       else
3848         emit_mov(rs,temp);
3849       rs=temp;
3850     }
3851   }
3852   address_generation(i+1,i_regs,regs[i].regmap_entry);
3853   #ifdef REG_PREFETCH
3854   if(rt1[i]==PR)
3855   {
3856     if((temp=get_reg(branch_regs[i].regmap,PTEMP))>=0) {
3857       int return_address=start+i*2+4;
3858       if(i_regmap[temp]==PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
3859     }
3860   }
3861   #endif
3862   #ifdef USE_MINI_HT
3863   if(rs1[i]==PR) {
3864     int rh=get_reg(regs[i].regmap,RHASH);
3865     if(rh>=0) do_preload_rhash(rh);
3866   }
3867   #endif
3868   if(rt1[i]==PR) {
3869     if(rt1[i+1]==PR||rt2[i+1]==PR) {
3870       // Delay slot abuse, set PR before executing delay slot
3871       int rt,return_address;
3872       rt=get_reg(regs[i].regmap,rt1[i]);
3873       assert(rt>=0);
3874       if(rt>=0) {
3875         return_address=start+i*2+4;
3876         #ifdef REG_PREFETCH
3877         if(temp>=0)
3878         {
3879           if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
3880         }
3881         #endif
3882         emit_movimm(return_address,rt); // PC into link register
3883       }
3884     }
3885   }
3886   ds_assemble(i+1,i_regs);
3887   bc_unneeded=regs[i].u;
3888   bc_unneeded|=1LL<<rt1[i];
3889   bc_unneeded&=~(1LL<<rs1[i]);
3890   wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,
3891                 bc_unneeded);
3892   load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i],CCREG,CCREG);
3893   if(rt1[i]==PR) {
3894     int rt,return_address;
3895     assert(rs1[i+1]!=PR);
3896     assert(rs2[i+1]!=PR);
3897     assert(rs3[i+1]!=PR);
3898     rt=get_reg(branch_regs[i].regmap,rt1[i]);
3899     assem_debug("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
3900     if(rt>=0&&rt1[i+1]!=PR&&rt2[i+1]!=PR) {
3901       return_address=start+i*2+4;
3902       #ifdef REG_PREFETCH
3903       if(temp>=0)
3904       {
3905         if(i_regmap[temp]!=PTEMP) emit_movimm((int)hash_table[((return_address>>16)^return_address)&0xFFFF],temp);
3906       }
3907       #endif
3908       emit_movimm(return_address,rt); // PC into link register
3909       #ifdef IMM_PREFETCH
3910       emit_prefetch(hash_table[((return_address>>16)^return_address)&0xFFFF]);
3911       #endif
3912     }
3913   }
3914   cc=get_reg(branch_regs[i].regmap,CCREG);
3915   assert(cc==HOST_CCREG);
3916   #ifdef USE_MINI_HT
3917   rh=get_reg(branch_regs[i].regmap,RHASH);
3918   ht=get_reg(branch_regs[i].regmap,RHTBL);
3919   if(rs1[i]==PR) {
3920     if(regs[i].regmap[rh]!=RHASH) do_preload_rhash(rh);
3921     do_preload_rhtbl(ht);
3922     do_rhash(rs,rh);
3923   }
3924   #endif
3925   if(opcode[i]==0&&opcode2[i]==11&&opcode3[i]==2) {
3926     // Return From Exception (RTE) - pop PC and SR from stack
3927     //printf("RTE\n");
3928     int map=get_reg(branch_regs[i].regmap,MOREG);
3929     int cache=get_reg(branch_regs[i].regmap,MMREG);
3930     int sp=get_reg(branch_regs[i].regmap,15);
3931     int sr=get_reg(branch_regs[i].regmap,SR);
3932     int jaddr=0;
3933     unsigned int hr;
3934     u32 reglist=0;
3935     temp=get_reg(branch_regs[i].regmap,RTEMP);
3936     for(hr=0;hr<HOST_REGS;hr++) {
3937       if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
3938     }
3939     assert(sp>=0);
3940     assert(sr>=0);
3941     assert(temp>=0);
3942     assert(map>=0);
3943     reglist&=~(1<<sr);
3944     reglist&=~(1<<temp);
3945     reglist&=~(1<<map);
3946     map=do_map_r(sp,-1,map,cache,0,-1,-1,0,0);
3947     do_map_r_branch(map,0,0,&jaddr);
3948     // direct load
3949     emit_readword_indexed_map(0,sp,map,temp);
3950     emit_addimm(sp,4,sp);
3951     emit_rorimm(temp,16,temp);
3952     emit_readword_indexed_map(0,sp,map,sr);
3953     emit_addimm(sp,4,sp);
3954     emit_rorimm(sr,16,sr);
3955     assert(jaddr);
3956     add_stub(LOADS_STUB,jaddr,(int)out,i,sp,(int)(&branch_regs[i]),ccadj[i],reglist);
3957     store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
3958     emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+cycles[i]+cycles[i+1]),HOST_CCREG);
3959     add_stub(CC_STUB,(int)out,jump_vaddr_reg[slave][temp],0,i,-1,TAKEN,0);
3960     emit_jns(0);
3961     emit_jmp(jump_vaddr_reg[slave][temp]);
3962   }
3963   else {
3964     if((((i_regs->wasdoingcp>>rs)&1)&&regs[i].regmap[rs]==branch_regs[i].regmap[rs])
3965        ||((i_regs->isconst>>rs1[i])&1)) {
3966       // Do constant propagation, branch to fixed address
3967       u32 constaddr;
3968       if(((i_regs->wasdoingcp>>rs)&1)&&regs[i].regmap[rs]==branch_regs[i].regmap[rs])
3969         constaddr=cpmap[i][rs];
3970       else
3971         constaddr=i_regs->constmap[rs1[i]];
3972       if(opcode[i]==0&&opcode2[i]==3) {
3973         // PC-relative branch, add PC+4
3974         constaddr+=start+i*2+4;
3975       }
3976       assert(ba[i]==constaddr);
3977       store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
3978       //emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+cycles[i]+cycles[i+1]),HOST_CCREG);
3979       //add_stub(CC_STUB,(int)out,jump_vaddr_reg[rs],0,i,-1,TAKEN,0);
3980       //emit_jns(0);
3981       do_cc(i,branch_regs[i].regmap,&adj,constaddr,TAKEN,0);
3982       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+cycles[i]+cycles[i+1]-adj),cc);
3983       load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
3984 
3985       if(internal_branch(constaddr)) assert(bt[(constaddr-start)>>1]);
3986       if(internal_branch(constaddr)&&bt[(constaddr-start)>>1]) {
3987         assem_debug("branch: internal (constant address)\n");
3988         if(is_ds[(constaddr-start)>>1]) {
3989           ds_assemble_entry(i);
3990         }
3991         else {
3992           add_to_linker((int)out,constaddr,1/*internal_branch*/);
3993           emit_jmp(0);
3994         }
3995       }
3996       else
3997       {
3998         assem_debug("branch: external (constant address)\n");
3999         add_to_linker((int)out,constaddr,0/*internal_branch*/);
4000         emit_jmp(0);
4001       }
4002     }
4003     else {
4004       ba[i]=-1;
4005       store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
4006       #ifdef REG_PREFETCH
4007       if(rt1[i]==PR&&temp>=0) emit_prefetchreg(temp);
4008       #endif
4009       #ifdef USE_MINI_HT
4010       if(rs1[i]==PR) {
4011         do_miniht_load(ht,rh);
4012       }
4013       #endif
4014       //#ifdef HOST_IMM_ADDR32 alternative using lea?
4015       if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
4016         if(opcode[i]==0&&opcode2[i]==3) {
4017           // PC-relative branch, add offset to PC
4018           temp=get_reg(branch_regs[i].regmap,RTEMP);
4019           if(regs[i].regmap[temp]!=RTEMP) {
4020             // Load PC if necessary
4021             emit_movimm(start+i*2+4,temp);
4022           }
4023           emit_add(rs,temp,temp);
4024           rs=temp;
4025         }
4026       }
4027       //do_cc(i,branch_regs[i].regmap,&adj,-1,TAKEN);
4028       //if(adj) emit_addimm(cc,2*(ccadj[i]+2-adj),cc); // ??? - Shouldn't happen
4029       //assert(adj==0);
4030       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+cycles[i]+cycles[i+1]),HOST_CCREG);
4031       add_stub(CC_STUB,(int)out,jump_vaddr_reg[slave][rs],0,i,-1,TAKEN,0);
4032       emit_jns(0);
4033       //load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,-1);
4034       #ifdef USE_MINI_HT
4035       if(rs1[i]==PR) {
4036         do_miniht_jump(rs,rh,ht);
4037       }
4038       else
4039       #endif
4040       {
4041         emit_jmp(jump_vaddr_reg[slave][rs]);
4042       }
4043     }
4044   }
4045   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4046   if(rt1[i]!=PR&&i<slen-2&&(((u32)out)&7)) emit_mov(13,13);
4047   #endif
4048 }
4049 
cjump_assemble(int i,struct regstat * i_regs)4050 void cjump_assemble(int i,struct regstat *i_regs)
4051 {
4052   signed char *i_regmap=i_regs->regmap;
4053   int cc;
4054   int match;
4055   int sr;
4056   int unconditional=0,nop=0;
4057   int adj;
4058   int invert=0;
4059   int internal;
4060   match=match_bt(regs[i].regmap,regs[i].dirty,ba[i]);
4061   assem_debug("match=%d\n",match);
4062   internal=internal_branch(ba[i]);
4063   if(i==(ba[i]-start)>>1) assem_debug("idle loop\n");
4064   if(!match) invert=1;
4065   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4066   if(i>(ba[i]-start)>>1) invert=1;
4067   #endif
4068   sr=get_reg(i_regmap,SR);
4069   assert(sr>=0);
4070   cc=get_reg(i_regmap,CCREG);
4071   assert(cc==HOST_CCREG);
4072   do_cc(i,regs[i].regmap,&adj,start+i*2,NODS,invert);
4073   if(unconditional)
4074     store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4075   if(unconditional) {
4076     do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4077     if(i!=(ba[i]-start)>>1 || source[i+1]!=0) {
4078       if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
4079       load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4080       if(internal)
4081         assem_debug("branch: internal\n");
4082       else
4083         assem_debug("branch: external\n");
4084       if(internal&&is_ds[(ba[i]-start)>>1]) {
4085         ds_assemble_entry(i);
4086       }
4087       else {
4088         add_to_linker((int)out,ba[i],internal);
4089         emit_jmp(0);
4090       }
4091       #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4092       if(((u32)out)&7) emit_addnop(0);
4093       #endif
4094     }
4095   }
4096   else if(nop) {
4097     int jaddr;
4098     emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
4099     jaddr=(int)out;
4100     emit_jns(0);
4101     add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*2+4,NOTTAKEN,0);
4102   }
4103   else {
4104     pointer taken=0,nottaken=0,nottaken1=0;
4105     //do_cc(i,regs[i].regmap,&adj,-1,0,invert);
4106     if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]-adj),cc);
4107 
4108     //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4109     emit_testimm(sr,1);
4110     if(opcode2[i]==9) // BT
4111     {
4112       if(invert){
4113         nottaken=(pointer)out;
4114         emit_jeq(1);
4115       }else{
4116         add_to_linker((int)out,ba[i],internal);
4117         emit_jne(0);
4118       }
4119     }
4120     if(opcode2[i]==11) // BF
4121     {
4122       if(invert){
4123         nottaken=(pointer)out;
4124         emit_jne(1);
4125       }else{
4126         add_to_linker((int)out,ba[i],internal);
4127         emit_jeq(0);
4128       }
4129     }
4130     if(invert) {
4131       if(taken) set_jump_target(taken,(pointer)out);
4132       #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4133       if(match&&(!internal||!is_ds[(ba[i]-start)>>1])) {
4134         if(adj) {
4135           emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
4136           add_to_linker((int)out,ba[i],internal);
4137         }else{
4138           emit_addnop(13);
4139           add_to_linker((int)out,ba[i],internal*2);
4140         }
4141         emit_jmp(0);
4142       }else
4143       #endif
4144       {
4145         if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
4146         store_regs_bt(regs[i].regmap,regs[i].dirty,ba[i]);
4147         load_regs_bt(regs[i].regmap,regs[i].dirty,ba[i]);
4148         if(internal)
4149           assem_debug("branch: internal\n");
4150         else
4151           assem_debug("branch: external\n");
4152         if(internal&&is_ds[(ba[i]-start)>>1]) {
4153           ds_assemble_entry(i);
4154         }
4155         else {
4156           add_to_linker((int)out,ba[i],internal);
4157           emit_jmp(0);
4158         }
4159       }
4160       set_jump_target(nottaken,(pointer)out);
4161     }
4162 
4163     //if(nottaken1) set_jump_target(nottaken1,(int)out);
4164     if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
4165   } // (!unconditional)
4166 }
4167 
sjump_assemble(int i,struct regstat * i_regs)4168 void sjump_assemble(int i,struct regstat *i_regs)
4169 {
4170   signed char *i_regmap=i_regs->regmap;
4171   int cc;
4172   int adj;
4173   int match;
4174   int sr;
4175   int unconditional=0,nop=0;
4176   int invert=0;
4177   int internal=internal_branch(ba[i]);
4178   match=match_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4179   assem_debug("match=%d\n",match);
4180   internal=internal_branch(ba[i]);
4181   if(i==(ba[i]-start)>>1) assem_debug("idle loop\n");
4182   if(!match) invert=1;
4183   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4184   if(i>(ba[i]-start)>>1) invert=1;
4185   #endif
4186 
4187   if(ooo[i]) {
4188     sr=get_reg(branch_regs[i].regmap,SR);
4189   }
4190   else {
4191     sr=get_reg(i_regmap,SR);
4192   }
4193 
4194   cc=get_reg(i_regmap,CCREG);
4195   assert(cc==HOST_CCREG);
4196 
4197   if(ooo[i]) {
4198     u64 bc_unneeded;
4199     // Out of order execution (delay slot first)
4200     //printf("OOOE\n");
4201     do_cc(i,regs[i].regmap,&adj,start+i*2,NODS,invert);
4202     address_generation(i+1,i_regs,regs[i].regmap_entry);
4203     ds_assemble(i+1,i_regs);
4204     bc_unneeded=regs[i].u;
4205     bc_unneeded&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
4206     wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,
4207                   bc_unneeded);
4208     load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,SR,SR);
4209     cc=get_reg(branch_regs[i].regmap,CCREG);
4210     assert(cc==HOST_CCREG);
4211     if(unconditional)
4212       store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4213     if(unconditional) {
4214       do_cc(i,branch_regs[i].regmap,&adj,ba[i],TAKEN,0);
4215       if(i!=(ba[i]-start)>>1 || source[i+1]!=0) {
4216         if(adj) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+2-adj),cc);
4217         load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4218         if(internal)
4219           assem_debug("branch: internal\n");
4220         else
4221           assem_debug("branch: external\n");
4222         if(internal&&is_ds[(ba[i]-start)>>1]) {
4223           ds_assemble_entry(i);
4224         }
4225         else {
4226           add_to_linker((int)out,ba[i],internal);
4227           emit_jmp(0);
4228         }
4229         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4230         if(((u32)out)&7) emit_addnop(0);
4231         #endif
4232       }
4233     }
4234     else if(nop) {
4235       int jaddr;
4236       emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+2),cc);
4237       jaddr=(int)out;
4238       emit_jns(0);
4239       add_stub(CC_STUB,jaddr,(int)out,0,i,start+i*2+4,NOTTAKEN,0);
4240     }
4241     else {
4242       pointer taken=0,nottaken=0,nottaken1=0;
4243       //do_cc(i,branch_regs[i].regmap,&adj,-1,0,invert);
4244       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]-adj),cc);
4245 
4246       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4247       assert(sr>=0);
4248       emit_testimm(sr,1);
4249       if(opcode2[i]==13) // BT/S
4250       {
4251         if(invert){
4252           nottaken=(pointer)out;
4253           emit_jeq(1);
4254         }else{
4255           add_to_linker((int)out,ba[i],internal);
4256           emit_jne(0);
4257         }
4258       }
4259       if(opcode2[i]==15) // BF/S
4260       {
4261         if(invert){
4262           nottaken=(pointer)out;
4263           emit_jne(1);
4264         }else{
4265           add_to_linker((int)out,ba[i],internal);
4266           emit_jeq(0);
4267         }
4268       }
4269       if(invert) {
4270         if(taken) set_jump_target(taken,(pointer)out);
4271         #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
4272         if(match&&(!internal||!is_ds[(ba[i]-start)>>1])) {
4273           if(adj) {
4274             emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
4275             add_to_linker((int)out,ba[i],internal);
4276           }else{
4277             emit_addnop(13);
4278             add_to_linker((int)out,ba[i],internal*2);
4279           }
4280           emit_jmp(0);
4281         }else
4282         #endif
4283         {
4284           if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
4285           store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4286           load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4287           if(internal)
4288             assem_debug("branch: internal\n");
4289           else
4290             assem_debug("branch: external\n");
4291           if(internal&&is_ds[(ba[i]-start)>>1]) {
4292             ds_assemble_entry(i);
4293           }
4294           else {
4295             add_to_linker((int)out,ba[i],internal);
4296             emit_jmp(0);
4297           }
4298         }
4299         set_jump_target(nottaken,(pointer)out);
4300       }
4301 
4302       if(nottaken1) set_jump_target(nottaken1,(pointer)out);
4303       if(adj&&!invert) emit_addimm(cc,CLOCK_DIVIDER*adj,cc);
4304     } // (!unconditional)
4305   } // if(ooo)
4306   else
4307   {
4308     // In-order execution (branch first)
4309     //printf("IOE\n");
4310     u64 ds_unneeded;
4311     pointer taken=0,nottaken=0,nottaken1=0;
4312     do_cc(i,regs[i].regmap,&adj,start+i*2,NODS,1);
4313     if(!unconditional&&!nop) {
4314       //printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
4315       assert(sr>=0);
4316       emit_testimm(sr,1);
4317       if(opcode2[i]==13) // BT/S
4318       {
4319         nottaken=(pointer)out;
4320         emit_jeq(2);
4321       }
4322       if(opcode2[i]==15) // BF/S
4323       {
4324         nottaken=(pointer)out;
4325         emit_jne(2);
4326       }
4327     } // if(!unconditional)
4328     ds_unneeded=regs[i].u;
4329     ds_unneeded&=~((1LL<<rs1[i+1])|(1LL<<rs2[i+1])|(1LL<<rs3[i+1]));
4330     // branch taken
4331     if(!nop) {
4332       if(taken) set_jump_target(taken,(int)out);
4333       assem_debug("1:\n");
4334       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,
4335                     ds_unneeded);
4336       // load regs
4337       load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1],rs3[i+1]);
4338       address_generation(i+1,&branch_regs[i],0);
4339       if(itype[i+1]==COMPLEX) {
4340         if((opcode[i+1]|4)==4&&opcode2[i+1]==15) { // MAC.W/MAC.L
4341           load_regs(regs[i].regmap,branch_regs[i].regmap,MACL,MACH,MACH);
4342         }
4343       }
4344       load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG,CCREG);
4345       ds_assemble(i+1,&branch_regs[i]);
4346       cc=get_reg(branch_regs[i].regmap,CCREG);
4347       if(cc==-1) {
4348         emit_loadreg(CCREG,cc=HOST_CCREG);
4349         // CHECK: Is the following instruction (fall thru) allocated ok?
4350       }
4351       assert(cc==HOST_CCREG);
4352       store_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4353       //do_cc(i,i_regmap,&adj,ba[i],TAKEN,0);
4354       assem_debug("cycle count (adj)\n");
4355       /*if(adj)*/ //emit_addimm(cc,CLOCK_DIVIDER*(ccadj[i]+cycles[i]+cycles[i+1]-adj),cc);
4356       if(adj) emit_addimm(cc,-CLOCK_DIVIDER*adj,cc);
4357       load_regs_bt(branch_regs[i].regmap,branch_regs[i].dirty,ba[i]);
4358       if(internal)
4359         assem_debug("branch: internal\n");
4360       else
4361         assem_debug("branch: external\n");
4362       if(internal&&is_ds[(ba[i]-start)>>1]) {
4363         ds_assemble_entry(i);
4364       }
4365       else {
4366         add_to_linker((int)out,ba[i],internal);
4367         emit_jmp(0);
4368       }
4369     }
4370     // branch not taken
4371     if(!unconditional) {
4372       if(nottaken1) set_jump_target(nottaken1,(int)out);
4373       set_jump_target(nottaken,(int)out);
4374       assem_debug("2:\n");
4375       wb_invalidate(regs[i].regmap,branch_regs[i].regmap,regs[i].dirty,
4376                     ds_unneeded);
4377       load_regs(regs[i].regmap,branch_regs[i].regmap,rs1[i+1],rs2[i+1],rs3[i+1]);
4378       address_generation(i+1,&branch_regs[i],0);
4379       if(itype[i+1]==COMPLEX) {
4380         if((opcode[i+1]|4)==4&&opcode2[i+1]==15) { // MAC.W/MAC.L
4381           load_regs(regs[i].regmap,branch_regs[i].regmap,MACL,MACH,MACH);
4382         }
4383       }
4384       load_regs(regs[i].regmap,branch_regs[i].regmap,CCREG,CCREG,CCREG);
4385       ds_assemble(i+1,&branch_regs[i]);
4386     }
4387   }
4388 }
4389 
system_assemble(int i,struct regstat * i_regs)4390 void system_assemble(int i,struct regstat *i_regs)
4391 {
4392   signed char ccreg=get_reg(i_regs->regmap,CCREG);
4393   assert(ccreg==HOST_CCREG);
4394   assert(!is_delayslot);
4395   if(opcode[i]==0&&opcode2[i]==11&&opcode3[i]==1) { // SLEEP
4396     pointer jaddr, return_address;
4397     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG);
4398     jaddr=(pointer)out;
4399     emit_jns(0);
4400     return_address=(pointer)out;
4401     emit_zeroreg(HOST_CCREG);
4402     set_jump_target(jaddr,(pointer)out);
4403     add_stub(CC_STUB,(int)out,return_address,0,i,start+i*2,TAKEN,0);
4404     emit_jmp(0);
4405     // DEBUG: Count in multiples of three to match interpreter
4406     //emit_addimm_and_set_flags(CLOCK_DIVIDER*3,HOST_CCREG);
4407     //add_stub(CC_STUB,(int)out,return_address,0,i,start+i*2,TAKEN,0);
4408     //emit_jns(0);
4409     emit_jmp(return_address);
4410   }
4411   else {
4412     int b,t,sr,st,map=-1,cache=-1;
4413     int jaddr=0;
4414     unsigned int hr;
4415     u32 reglist=0;
4416     assert(opcode[i]==12); // TRAPA
4417     t=get_reg(i_regs->regmap,-1);
4418     b=get_reg(i_regs->regmap,VBR);
4419     sr=get_reg(i_regs->regmap,SR);
4420     st=get_reg(i_regs->regmap,15); // STACK
4421     for(hr=0;hr<HOST_REGS;hr++) {
4422       if(i_regs->regmap[hr]>=0) reglist|=1<<hr;
4423     }
4424     assert(t>=0);
4425     assert(b>=0);
4426     assert(sr>=0);
4427     assert(st>=0);
4428     emit_addimm(st,-4,st);
4429     map=get_reg(i_regs->regmap,MOREG);
4430     cache=get_reg(i_regs->regmap,MMREG);
4431     assert(map>=0);
4432     reglist&=~(1<<map);
4433     map=do_map_w(st,st,map,cache,0,0,0);
4434     do_map_w_branch(map,0,0,&jaddr);
4435     // Save SR
4436     emit_rorimm(sr,16,sr);
4437     emit_writeword_indexed_map(sr,0,st,map,map);
4438     emit_rorimm(sr,16,sr);
4439     if(jaddr) {
4440       add_stub(STOREL_STUB,jaddr,(int)out,i,st,(int)i_regs,ccadj[i],reglist);
4441     }
4442     emit_addimm(st,-4,st);
4443     store_regs_bt(i_regs->regmap,i_regs->dirty,-1);
4444     emit_movimm(start+i*2+2,sr);
4445     emit_addimm(b,imm[i]<<2,b);
4446     map=do_map_w(st,st,map,cache,0,0,0);
4447     do_map_w_branch(map,0,0,&jaddr);
4448     // Save PC
4449     emit_rorimm(sr,16,sr);
4450     emit_writeword_indexed_map(sr,0,st,map,map);
4451     if(jaddr) {
4452       add_stub(STOREL_STUB,jaddr,(int)out,i,st,(int)i_regs,ccadj[i],reglist);
4453     }
4454     // Load PC
4455     map=do_map_r(b,b,map,cache,0,-1,-1,0,0);
4456     do_map_r_branch(map,0,0,&jaddr);
4457     emit_readword_indexed_map(0,b,map,t);
4458     emit_rorimm(t,16,t);
4459     if(jaddr)
4460       add_stub(LOADL_STUB,jaddr,(int)out,i,t,(int)i_regs,ccadj[i],reglist);
4461     if(i_regs->regmap[HOST_CCREG]!=CCREG) {
4462       emit_loadreg(CCREG,HOST_CCREG);
4463     }
4464     emit_addimm_and_set_flags(CLOCK_DIVIDER*(ccadj[i]+cycles[i]),HOST_CCREG);
4465     //add_stub(CC_STUB,(int)out,jump_vaddr_reg[slave][t],0,i,-1,TAKEN,0); // FIXME
4466     //emit_jns(0);
4467     emit_jmp(jump_vaddr_reg[slave][t]);
4468   }
4469 }
4470 
bios_assemble(int i,struct regstat * i_regs)4471 void bios_assemble(int i,struct regstat *i_regs)
4472 {
4473   signed char ccreg=get_reg(i_regs->regmap,CCREG);
4474   assert(ccreg==HOST_CCREG);
4475   assert(!is_delayslot);
4476   emit_movimm(start+i*2,0);
4477   //emit_writeword(0,slave?(int)&slave_pc:(int)&master_pc);
4478   emit_addimm(HOST_CCREG,CLOCK_DIVIDER*ccadj[i],HOST_CCREG);
4479   if(slave)
4480     emit_call((pointer)slave_handle_bios); // Probably doesn't work
4481   else
4482     emit_call((pointer)master_handle_bios);
4483 }
4484 
4485 // Basic liveness analysis for SH2 registers
unneeded_registers(int istart,int iend,int r)4486 void unneeded_registers(int istart,int iend,int r)
4487 {
4488   int i;
4489   u64 u,uu,b,bu;
4490   u64 temp_u,temp_uu;
4491   u64 tdep;
4492   if(iend==slen-1) {
4493     u=0;
4494   }else{
4495     u=unneeded_reg[iend+1];
4496     u=0;
4497   }
4498   for (i=iend;i>=istart;i--)
4499   {
4500     //printf("unneeded registers i=%d (%d,%d) r=%d\n",i,istart,iend,r);
4501     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
4502     {
4503       if(ba[i]<start || ba[i]>=(start+slen*2))
4504       {
4505         // Branch out of this block, flush all regs
4506         u=0;
4507         branch_unneeded_reg[i]=u;
4508         if(itype[i]!=CJUMP) {
4509           // Merge in delay slot
4510           if(rt1[i+1]>=0) u|=1LL<<rt1[i+1];
4511           if(rt2[i+1]>=0) u|=1LL<<rt2[i+1];
4512           if(rs1[i+1]>=0) u&=~(1LL<<rs1[i+1]);
4513           if(rs2[i+1]>=0) u&=~(1LL<<rs2[i+1]);
4514           if(rs3[i+1]>=0) u&=~(1LL<<rs3[i+1]);
4515         }
4516       }
4517       else
4518       {
4519         if(ba[i]<=start+i*2) {
4520           // Backward branch
4521           if(itype[i]==UJUMP||itype[i]==RJUMP)
4522           {
4523             // Unconditional branch
4524             temp_u=0;
4525           } else if(itype[i]==CJUMP) {
4526             // Conditional branch (not taken case)
4527             temp_u=unneeded_reg[i+1];
4528           } else {
4529             // Conditional branch (not taken case)
4530             temp_u=unneeded_reg[i+2];
4531           }
4532           if(itype[i]!=CJUMP) {
4533             // Merge in delay slot
4534             if(rt1[i+1]>=0) temp_u|=1LL<<rt1[i+1];
4535             if(rt2[i+1]>=0) temp_u|=1LL<<rt2[i+1];
4536             if(rs1[i+1]>=0) temp_u&=~(1LL<<rs1[i+1]);
4537             if(rs2[i+1]>=0) temp_u&=~(1LL<<rs2[i+1]);
4538             if(rs3[i+1]>=0) temp_u&=~(1LL<<rs3[i+1]);
4539           }
4540           if(rt1[i]>=0) temp_u|=1LL<<rt1[i];
4541           if(rt2[i]>=0) temp_u|=1LL<<rt2[i];
4542           if(rs1[i]>=0) temp_u&=~(1LL<<rs1[i]);
4543           if(rs2[i]>=0) temp_u&=~(1LL<<rs2[i]);
4544           if(rs3[i]>=0) temp_u&=~(1LL<<rs3[i]);
4545           unneeded_reg[i]=temp_u;
4546           // Only go three levels deep.  This recursion can take an
4547           // excessive amount of time if there are a lot of nested loops.
4548           if(r<2) {
4549             unneeded_registers((ba[i]-start)>>1,i-1,r+1);
4550           }else{
4551             unneeded_reg[(ba[i]-start)>>1]=0;
4552           }
4553         } /*else*/ if(1) {
4554           if(itype[i]==UJUMP||itype[i]==RJUMP)
4555           {
4556             // Unconditional branch
4557             u=unneeded_reg[(ba[i]-start)>>1];
4558             // Always need stack and status in case of interrupt
4559             u&=~((1LL<<15)|(1LL<<SR));
4560             branch_unneeded_reg[i]=u;
4561         //u=0; // for debugging
4562         //branch_unneeded_reg[i]=u; // for debugging
4563             // Merge in delay slot
4564             if(rt1[i+1]>=0) u|=1LL<<rt1[i+1];
4565             if(rt2[i+1]>=0) u|=1LL<<rt2[i+1];
4566             if(rs1[i+1]>=0) u&=~(1LL<<rs1[i+1]);
4567             if(rs2[i+1]>=0) u&=~(1LL<<rs2[i+1]);
4568             if(rs3[i+1]>=0) u&=~(1LL<<rs3[i+1]);
4569           } else {
4570             // Conditional branch
4571             b=unneeded_reg[(ba[i]-start)>>1];
4572             branch_unneeded_reg[i]=b;
4573         //b=0; // for debugging
4574         //branch_unneeded_reg[i]=b; // for debugging
4575             // Branch delay slot
4576             if(itype[i]!=CJUMP) {
4577               if(rt1[i+1]>=0) b|=1LL<<rt1[i+1];
4578               if(rt2[i+1]>=0) b|=1LL<<rt2[i+1];
4579               if(rs1[i+1]>=0) b&=~(1LL<<rs1[i+1]);
4580               if(rs2[i+1]>=0) b&=~(1LL<<rs2[i+1]);
4581               if(rs3[i+1]>=0) b&=~(1LL<<rs3[i+1]);
4582             }
4583             u&=b;
4584             // Always need stack and status in case of interrupt
4585             u&=~((1LL<<15)|(1LL<<SR));
4586         //u=0; // for debugging
4587             if(itype[i]!=CJUMP) {
4588               if(i<slen-1) {
4589                 branch_unneeded_reg[i]&=unneeded_reg[i+2];
4590               } else {
4591                 branch_unneeded_reg[i]=0;
4592               }
4593             }else{
4594               if(i<slen) {
4595                 branch_unneeded_reg[i]&=unneeded_reg[i+1];
4596               } else {
4597                 branch_unneeded_reg[i]=0;
4598               }
4599             }
4600         //branch_unneeded_reg[i]=0; // for debugging
4601           }
4602         }
4603       }
4604     }
4605     else if(itype[i]==RJUMP && source[i]==0x2b)
4606     {
4607       // RTE instruction (return from exception)
4608       u=(1<<SR);
4609     }
4610     else if(itype[i]==SYSTEM && opcode[i]==12)
4611     {
4612       // TRAPA instruction (syscall)
4613       u=0;
4614     }
4615     //u=uu=0; // DEBUG
4616     //tdep=(~uu>>rt1[i])&1;
4617     // Written registers are unneeded
4618     if(rt1[i]>=0) u|=1LL<<rt1[i];
4619     if(rt2[i]>=0) u|=1LL<<rt2[i];
4620     // Accessed registers are needed
4621     if(rs1[i]>=0) u&=~(1LL<<rs1[i]);
4622     if(rs2[i]>=0) u&=~(1LL<<rs2[i]);
4623     if(rs3[i]>=0) u&=~(1LL<<rs3[i]);
4624     // Source-target dependencies
4625     //uu&=~(tdep<<dep1[i]);
4626     //uu&=~(tdep<<dep2[i]);
4627     if(u&(1<<SR)) u|=(1<<TBIT);
4628     // Save it
4629     unneeded_reg[i]=u;
4630   }
4631 }
4632 
4633 // Write back dirty registers as soon as we will no longer modify them,
4634 // so that we don't end up with lots of writes at the branches.
//
// Walks instructions iend down to istart while maintaining two bitmasks
// with one bit per host register (bit r <-> host register r):
//   will_dirty_i - OR-ed into the dirty flags of regs[i] / branch_regs[i]
//   wont_dirty_i - used as an AND mask on those dirty flags
// The per-instruction results are saved in will_dirty[i] / wont_dirty[i].
//
// istart, iend - inclusive range of instruction indices to process
// wr           - nonzero: apply the masks to regs[].dirty, regs[].wasdirty
//                and branch_regs[].dirty, and re-run the body of every
//                backward branch once with wr=0;
//                zero: only compute will_dirty[]/wont_dirty[], and at a
//                backward branch reset the target's entries instead of
//                recursing
clean_registers(int istart,int iend,int wr)4635 void clean_registers(int istart,int iend,int wr)
4636 {
4637   int i;
4638   int r;
4639   u32 will_dirty_i,will_dirty_next,temp_will_dirty;
4640   u32 wont_dirty_i,wont_dirty_next,temp_wont_dirty;
  // Seed the masks: empty when iend is the last instruction of the block,
  // otherwise continue from the values saved for instruction iend+1.
4641   if(iend==slen-1) {
4642     will_dirty_i=will_dirty_next=0;
4643     wont_dirty_i=wont_dirty_next=0;
4644   }else{
4645     will_dirty_i=will_dirty_next=will_dirty[iend+1];
4646     wont_dirty_i=wont_dirty_next=wont_dirty[iend+1];
4647   }
4648   for (i=iend;i>=istart;i--)
4649   {
4650     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
4651     {
4652       if(ba[i]<start || ba[i]>=(start+slen*2))
4653       {
4654         // Branch out of this block, flush all regs
4655         if(itype[i]==RJUMP||itype[i]==UJUMP)
4656         {
4657           // Unconditional branch
4658           will_dirty_i=0;
4659           wont_dirty_i=0;
4660           // Merge in delay slot (will dirty)
          // Statement order matters below: bits are set for targets of the
          // branch (i) and the delay slot (i+1), then cleared for host
          // registers mapped above TBIT or unmapped, then CCREG is forced.
4661           for(r=0;r<HOST_REGS;r++) {
4662             if(r!=EXCLUDE_REG) {
4663               if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
4664               if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
4665               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
4666               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
4667               if((branch_regs[i].regmap[r]&63)>TBIT) will_dirty_i&=~(1<<r);
4668               if(branch_regs[i].regmap[r]<0) will_dirty_i&=~(1<<r);
4669               if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
4670               if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
4671               if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
4672               if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
4673               if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
4674               if((regs[i].regmap[r]&63)>TBIT) will_dirty_i&=~(1<<r);
4675               if(regs[i].regmap[r]<0) will_dirty_i&=~(1<<r);
4676               if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
              // A write to TBIT also marks the host register holding SR
4677               if(rt1[i]==TBIT||rt2[i]==TBIT||rt1[i+1]==TBIT||rt2[i+1]==TBIT) {
4678                 if(regs[i].regmap[r]==SR) will_dirty_i|=1<<r;
4679                 if(branch_regs[i].regmap[r]==SR) will_dirty_i|=1<<r;
4680               }
4681             }
4682           }
4683         }
4684         else
4685         {
4686           // Conditional branch
4687           will_dirty_i=0;
4688           wont_dirty_i=wont_dirty_next;
4689           // Merge in delay slot (will dirty)
4690           for(r=0;r<HOST_REGS;r++) {
4691             if(r!=EXCLUDE_REG) {
4692               if(itype[i]==SJUMP) {
4693                 // Only conditional branches with delay slots
4694                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
4695                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
4696                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
4697                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
4698                 if((branch_regs[i].regmap[r]&63)>TBIT) will_dirty_i&=~(1<<r);
4699                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
4700                 //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
4701                 //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
4702                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
4703                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
4704                 if((regs[i].regmap[r]&63)>TBIT) will_dirty_i&=~(1<<r);
4705                 if(regs[i].regmap[r]<0) will_dirty_i&=~(1<<r);
4706                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
4707                 if(rt1[i]==TBIT||rt2[i]==TBIT||rt1[i+1]==TBIT||rt2[i+1]==TBIT) {
4708                   if(regs[i].regmap[r]==SR) will_dirty_i|=1<<r;
4709                   if(branch_regs[i].regmap[r]==SR) will_dirty_i|=1<<r;
4710                 }
4711               }
4712             }
4713           }
4714         }
4715         // Merge in delay slot (wont dirty)
4716         for(r=0;r<HOST_REGS;r++) {
4717           if(r!=EXCLUDE_REG) {
4718             if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
4719             if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
4720             if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
4721             if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
4722             if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
4723             if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
4724             if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
4725             if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
4726             if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
4727             if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
4728             if(rt1[i]==TBIT||rt2[i]==TBIT||rt1[i+1]==TBIT||rt2[i+1]==TBIT) {
4729               if(regs[i].regmap[r]==SR) wont_dirty_i|=1<<r;
4730               if(branch_regs[i].regmap[r]==SR) wont_dirty_i|=1<<r;
4731             }
4732             if(opcode[i]==0&&opcode2[i]==11&&opcode3[i]==2)
4733             {
4734               // RTE instruction (return from interrupt)
              // Host registers holding r15 or SR get both mask bits set
4735               if(regs[i].regmap[r]==15||branch_regs[i].regmap[r]==15) {
4736                 wont_dirty_i|=1<<r;
4737                 will_dirty_i|=1<<r;
4738               }
4739               if(regs[i].regmap[r]==SR||branch_regs[i].regmap[r]==SR) {
4740                 wont_dirty_i|=1<<r;
4741                 will_dirty_i|=1<<r;
4742               }
4743             }
4744           }
4745         }
4746         if(wr) {
4747           //#ifndef DESTRUCTIVE_WRITEBACK
4748           branch_regs[i].dirty&=wont_dirty_i;
4749           //#endif
4750           branch_regs[i].dirty|=will_dirty_i;
4751         }
4752       }
4753       else
4754       {
4755         // Internal branch
4756         if(ba[i]<=start+i*2) {
4757           // Recursively evaluate backward branches
          // temp_will_dirty/temp_wont_dirty are built for instruction i and
          // saved before the loop body (target..i-1) is re-evaluated.
4758           if(itype[i]==RJUMP||itype[i]==UJUMP)
4759           {
4760             // Unconditional branch
4761             temp_will_dirty=0;
4762             temp_wont_dirty=0;
4763             // Merge in delay slot (will dirty)
4764             for(r=0;r<HOST_REGS;r++) {
4765               if(r!=EXCLUDE_REG) {
4766                 if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
4767                 if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
4768                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
4769                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
4770                 if((branch_regs[i].regmap[r]&63)>TBIT) temp_will_dirty&=~(1<<r);
4771                 if(branch_regs[i].regmap[r]<0) temp_will_dirty&=~(1<<r);
4772                 if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
4773                 if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
4774                 if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
4775                 if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
4776                 if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
4777                 if((regs[i].regmap[r]&63)>TBIT) temp_will_dirty&=~(1<<r);
4778                 if(regs[i].regmap[r]<0) temp_will_dirty&=~(1<<r);
4779                 if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
4780                 if(rt1[i]==TBIT||rt2[i]==TBIT||rt1[i+1]==TBIT||rt2[i+1]==TBIT) {
4781                   if(regs[i].regmap[r]==SR) temp_will_dirty|=1<<r;
4782                   if(branch_regs[i].regmap[r]==SR) temp_will_dirty|=1<<r;
4783                 }
4784               }
4785             }
4786           } else {
4787             // Conditional branch (not taken case)
4788             temp_will_dirty=will_dirty_next;
4789             temp_wont_dirty=wont_dirty_next;
4790             // Merge in delay slot (will dirty)
4791             for(r=0;r<HOST_REGS;r++) {
4792               if(r!=EXCLUDE_REG) {
4793                 if(itype[i]==SJUMP) {
4794                   // Only /S instructions have a delay slot
4795                   if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
4796                   if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
4797                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
4798                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
4799                   if((branch_regs[i].regmap[r]&63)>TBIT) temp_will_dirty&=~(1<<r);
4800                   if(branch_regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
4801                   //if((regs[i].regmap[r]&63)==rt1[i]) temp_will_dirty|=1<<r;
4802                   //if((regs[i].regmap[r]&63)==rt2[i]) temp_will_dirty|=1<<r;
4803                   if((regs[i].regmap[r]&63)==rt1[i+1]) temp_will_dirty|=1<<r;
4804                   if((regs[i].regmap[r]&63)==rt2[i+1]) temp_will_dirty|=1<<r;
4805                   if((regs[i].regmap[r]&63)>TBIT) temp_will_dirty&=~(1<<r);
4806                   if(regs[i].regmap[r]<0) temp_will_dirty&=~(1<<r);
4807                   if(regs[i].regmap[r]==CCREG) temp_will_dirty|=1<<r;
4808                   if(rt1[i]==TBIT||rt2[i]==TBIT||rt1[i+1]==TBIT||rt2[i+1]==TBIT) {
4809                     if(regs[i].regmap[r]==SR) temp_will_dirty|=1<<r;
4810                     if(branch_regs[i].regmap[r]==SR) temp_will_dirty|=1<<r;
4811                   }
4812                 }
4813               }
4814             }
4815           }
4816           // Merge in delay slot (wont dirty)
4817           for(r=0;r<HOST_REGS;r++) {
4818             if(r!=EXCLUDE_REG) {
4819               if((regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
4820               if((regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
4821               if((regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
4822               if((regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
4823               if(regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
4824               if((branch_regs[i].regmap[r]&63)==rt1[i]) temp_wont_dirty|=1<<r;
4825               if((branch_regs[i].regmap[r]&63)==rt2[i]) temp_wont_dirty|=1<<r;
4826               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) temp_wont_dirty|=1<<r;
4827               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) temp_wont_dirty|=1<<r;
4828               if(branch_regs[i].regmap[r]==CCREG) temp_wont_dirty|=1<<r;
4829               if(rt1[i]==TBIT||rt2[i]==TBIT||rt1[i+1]==TBIT||rt2[i+1]==TBIT) {
4830                 if(regs[i].regmap[r]==SR) temp_wont_dirty|=1<<r;
4831                 if(branch_regs[i].regmap[r]==SR) temp_wont_dirty|=1<<r;
4832               }
4833             }
4834           }
4835           // Deal with changed mappings
          // Where the host register held something else before instruction
          // i (regmap_pre), replace the bit with the unneeded_reg[i] bit of
          // the previous occupant, or set it if that occupant is not below
          // TBIT.  The ">=0" test is always true after masking with 63.
4836           if(i<iend) {
4837             for(r=0;r<HOST_REGS;r++) {
4838               if(r!=EXCLUDE_REG) {
4839                 if(regs[i].regmap[r]!=regmap_pre[i][r]) {
4840                   temp_will_dirty&=~(1<<r);
4841                   temp_wont_dirty&=~(1<<r);
4842                   if((regmap_pre[i][r]&63)>=0 && (regmap_pre[i][r]&63)<TBIT) {
4843                     temp_will_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
4844                     temp_wont_dirty|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
4845                   } else {
4846                     temp_will_dirty|=1<<r;
4847                     temp_wont_dirty|=1<<r;
4848                   }
4849                 }
4850               }
4851             }
4852           }
4853           if(wr) {
            // Publish the masks for i, then re-run the loop body without
            // writing back (wr=0) so the target's entries are refreshed.
4854             will_dirty[i]=temp_will_dirty;
4855             wont_dirty[i]=temp_wont_dirty;
4856             clean_registers((ba[i]-start)>>1,i-1,0);
4857           }else{
4858             // Limit recursion.  It can take an excessive amount
4859             // of time if there are a lot of nested loops.
4860             will_dirty[(ba[i]-start)>>1]=0;
4861             wont_dirty[(ba[i]-start)>>1]=-1;
4862           }
4863         }
        // The block below runs for every internal branch, forward or
        // backward (the "else" is deliberately commented out).
4864         /*else*/ if(1)
4865         {
4866           if(itype[i]==RJUMP||itype[i]==UJUMP)
4867           {
4868             // Unconditional branch
4869             will_dirty_i=0;
4870             wont_dirty_i=0;
4871           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
            // Import the target's masks for host registers whose mapping
            // matches the target's entry mapping; additionally set the bit
            // when the mapped register is unneeded at the target.
4872             for(r=0;r<HOST_REGS;r++) {
4873               if(r!=EXCLUDE_REG) {
4874                 if(branch_regs[i].regmap[r]==regs[(ba[i]-start)>>1].regmap_entry[r]) {
4875                   will_dirty_i|=will_dirty[(ba[i]-start)>>1]&(1<<r);
4876                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>1]&(1<<r);
4877                 }
4878                 if(branch_regs[i].regmap[r]>=0) {
4879                   will_dirty_i|=((unneeded_reg[(ba[i]-start)>>1]>>branch_regs[i].regmap[r])&1)<<r;
4880                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>1]>>branch_regs[i].regmap[r])&1)<<r;
4881                 }
4882               }
4883             }
4884           //}
4885             // Merge in delay slot
4886             for(r=0;r<HOST_REGS;r++) {
4887               if(r!=EXCLUDE_REG) {
4888                 if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
4889                 if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
4890                 if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
4891                 if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
4892                 if((branch_regs[i].regmap[r]&63)>TBIT) will_dirty_i&=~(1<<r);
4893                 if(branch_regs[i].regmap[r]<0) will_dirty_i&=~(1<<r);
4894                 if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
4895                 if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
4896                 if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
4897                 if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
4898                 if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
4899                 if((regs[i].regmap[r]&63)>TBIT) will_dirty_i&=~(1<<r);
4900                 if(regs[i].regmap[r]<0) will_dirty_i&=~(1<<r);
4901                 if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
4902                 if(rt1[i]==TBIT||rt2[i]==TBIT||rt1[i+1]==TBIT||rt2[i+1]==TBIT) {
4903                   if(regs[i].regmap[r]==SR) will_dirty_i|=1<<r;
4904                   if(branch_regs[i].regmap[r]==SR) will_dirty_i|=1<<r;
4905                 }
4906               }
4907             }
4908           } else {
4909             // Conditional branch
4910             will_dirty_i=will_dirty_next;
4911             wont_dirty_i=wont_dirty_next;
4912           //if(ba[i]>start+i*4) { // Disable recursion (for debugging)
            // Combine the fall-through masks with the target's: will_dirty
            // is AND-ed (note the AND operand has only bit r possibly set),
            // wont_dirty is OR-ed.  CJUMP uses regs[i], SJUMP branch_regs[i].
4913             for(r=0;r<HOST_REGS;r++) {
4914               if(r!=EXCLUDE_REG) {
4915                 signed char target_reg=(itype[i]==CJUMP)?regs[i].regmap[r]:branch_regs[i].regmap[r];
4916                 if(target_reg==regs[(ba[i]-start)>>1].regmap_entry[r]) {
4917                   will_dirty_i&=will_dirty[(ba[i]-start)>>1]&(1<<r);
4918                   wont_dirty_i|=wont_dirty[(ba[i]-start)>>1]&(1<<r);
4919                 }
4920                 else if(target_reg>=0) {
4921                   will_dirty_i&=((unneeded_reg[(ba[i]-start)>>1]>>target_reg)&1)<<r;
4922                   wont_dirty_i|=((unneeded_reg[(ba[i]-start)>>1]>>target_reg)&1)<<r;
4923                 }
4924               }
4925             }
4926           //}
4927             // Merge in delay slot
4928             for(r=0;r<HOST_REGS;r++) {
4929               if(r!=EXCLUDE_REG) {
4930                 if(itype[i]==SJUMP) {
4931                   // Only /S branches have delay slots
4932                   if((branch_regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
4933                   if((branch_regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
4934                   if((branch_regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
4935                   if((branch_regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
4936                   if((branch_regs[i].regmap[r]&63)>TBIT) will_dirty_i&=~(1<<r);
4937                   if(branch_regs[i].regmap[r]<0) will_dirty_i&=~(1<<r);
4938                   if(branch_regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
4939                   //if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
4940                   //if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
4941                   if((regs[i].regmap[r]&63)==rt1[i+1]) will_dirty_i|=1<<r;
4942                   if((regs[i].regmap[r]&63)==rt2[i+1]) will_dirty_i|=1<<r;
4943                   if((regs[i].regmap[r]&63)>TBIT) will_dirty_i&=~(1<<r);
4944                   if(regs[i].regmap[r]<0) will_dirty_i&=~(1<<r);
4945                   if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
4946                   if(rt1[i]==TBIT||rt2[i]==TBIT||rt1[i+1]==TBIT||rt2[i+1]==TBIT) {
4947                     if(regs[i].regmap[r]==SR) will_dirty_i|=1<<r;
4948                     if(branch_regs[i].regmap[r]==SR) will_dirty_i|=1<<r;
4949                   }
4950                 }
4951               }
4952             }
4953           }
4954           // Merge in delay slot (won't dirty)
4955           for(r=0;r<HOST_REGS;r++) {
4956             if(r!=EXCLUDE_REG) {
4957               if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
4958               if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
4959               if((regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
4960               if((regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
4961               if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
4962               if((branch_regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
4963               if((branch_regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
4964               if((branch_regs[i].regmap[r]&63)==rt1[i+1]) wont_dirty_i|=1<<r;
4965               if((branch_regs[i].regmap[r]&63)==rt2[i+1]) wont_dirty_i|=1<<r;
4966               if(branch_regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
4967               if(rt1[i]==TBIT||rt2[i]==TBIT||rt1[i+1]==TBIT||rt2[i+1]==TBIT) {
4968                 if(regs[i].regmap[r]==SR) wont_dirty_i|=1<<r;
4969                 if(branch_regs[i].regmap[r]==SR) wont_dirty_i|=1<<r;
4970               }
4971             }
4972           }
4973           if(wr) {
4974             //#ifndef DESTRUCTIVE_WRITEBACK
4975             branch_regs[i].dirty&=wont_dirty_i;
4976             //#endif
4977             branch_regs[i].dirty|=will_dirty_i;
4978           }
4979         }
4980       }
4981     }
4982     else if(itype[i]==SYSCALL) // FIXME
4983     {
4984       // SYSCALL instruction (software interrupt)
4985       will_dirty_i=0;
4986       wont_dirty_i=0;
4987     }
    // Snapshot the masks before instruction i's own targets are merged;
    // the branch handling of the next iteration (i-1) reads these.
4988     will_dirty_next=will_dirty_i;
4989     wont_dirty_next=wont_dirty_i;
    // Merge instruction i itself into both masks
4990     for(r=0;r<HOST_REGS;r++) {
4991       if(r!=EXCLUDE_REG) {
4992         if((regs[i].regmap[r]&63)==rt1[i]) will_dirty_i|=1<<r;
4993         if((regs[i].regmap[r]&63)==rt2[i]) will_dirty_i|=1<<r;
4994         if(rt1[i]==TBIT||rt2[i]==TBIT)
4995           if(regs[i].regmap[r]==SR) will_dirty_i|=1<<r;
4996         if((regs[i].regmap[r]&63)>TBIT) will_dirty_i&=~(1<<r);
4997         if(regs[i].regmap[r]<0) will_dirty_i&=~(1<<r);
4998         if(regs[i].regmap[r]==CCREG) will_dirty_i|=1<<r;
4999         if((regs[i].regmap[r]&63)==rt1[i]) wont_dirty_i|=1<<r;
5000         if((regs[i].regmap[r]&63)==rt2[i]) wont_dirty_i|=1<<r;
5001         if(rt1[i]==TBIT||rt2[i]==TBIT)
5002           if(regs[i].regmap[r]==SR) wont_dirty_i|=1<<r;
5003         if(regs[i].regmap[r]==CCREG) wont_dirty_i|=1<<r;
5004         if(itype[i]==COMPLEX)
5005         {
5006           if((opcode[i]|4)==4&&opcode2[i]==15) { // MAC.L/MAC.W
5007             if(regs[i].regmap[r]==MACL||regs[i].regmap[r]==MACH) {
5008               wont_dirty_i|=1<<r;
5009               will_dirty_i|=1<<r;
5010             }
5011           }
5012         }
5013         if(i>istart) {
5014           if(itype[i]!=RJUMP&&itype[i]!=UJUMP&&itype[i]!=CJUMP&&itype[i]!=SJUMP)
5015           {
5016             // Don't store a register immediately after writing it,
5017             // may prevent dual-issue.
5018             if((regs[i].regmap[r]&63)==rt1[i-1]) wont_dirty_i|=1<<r;
5019             if((regs[i].regmap[r]&63)==rt2[i-1]) wont_dirty_i|=1<<r;
5020           }
5021         }
5022       }
5023     }
5024     // Save it
5025     will_dirty[i]=will_dirty_i;
5026     wont_dirty[i]=wont_dirty_i;
5027     // Mark registers that won't be dirtied as not dirty
5028     if(wr) {
5029       /*printf("wr (%d,%d) %x will:",istart,iend,start+i*4);
5030       for(r=0;r<HOST_REGS;r++) {
5031         if((will_dirty_i>>r)&1) {
5032           printf(" r%d",r);
5033         }
5034       }
5035       printf("\n");*/
5036 
      // Note the order: OR first, then AND, so for regs[i] a dirty flag
      // remains set only where wont_dirty_i has the bit set.
5037       regs[i].dirty|=will_dirty_i;
5038       //#ifndef DESTRUCTIVE_WRITEBACK
5039       regs[i].dirty&=wont_dirty_i;
      // Propagate cleared bits to the wasdirty flags of the following
      // instruction when the host register mapping is unchanged.  The inner
      // test excludes RJUMP and UJUMP, so of the types admitted here only
      // SJUMP updates regs[i+2]; CJUMP takes the else path (regs[i+1]).
5040       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==SJUMP)
5041       {
5042         if(i<iend-1&&itype[i]!=RJUMP&&itype[i]!=UJUMP) {
5043           for(r=0;r<HOST_REGS;r++) {
5044             if(r!=EXCLUDE_REG) {
5045               if(regs[i].regmap[r]==regmap_pre[i+2][r]) {
5046                 regs[i+2].wasdirty&=wont_dirty_i|~(1<<r);
5047               }else {/*printf("i: %x (%d) mismatch(+2): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
5048             }
5049           }
5050         }
5051       }
5052       else
5053       {
5054         if(i<iend) {
5055           for(r=0;r<HOST_REGS;r++) {
5056             if(r!=EXCLUDE_REG) {
5057               if(regs[i].regmap[r]==regmap_pre[i+1][r]) {
5058                 regs[i+1].wasdirty&=wont_dirty_i|~(1<<r);
5059               }else {/*printf("i: %x (%d) mismatch(+1): %d\n",start+i*4,i,r);assert(!((wont_dirty_i>>r)&1));*/}
5060             }
5061           }
5062         }
5063       }
5064       //#endif
5065     }
5066     // Deal with changed mappings
    // Translate the masks from the mapping after instruction i (regmap) to
    // the mapping before it (regmap_pre) so the next iteration (i-1) sees
    // bits that refer to the right host registers.  temp_* keep the
    // untranslated values for lookups by the new register number.
5067     temp_will_dirty=will_dirty_i;
5068     temp_wont_dirty=wont_dirty_i;
5069     for(r=0;r<HOST_REGS;r++) {
5070       if(r!=EXCLUDE_REG) {
5071         int nr;
5072         if(regs[i].regmap[r]==regmap_pre[i][r]) {
          // Same occupant before and after: only update wasdirty
5073           if(wr) {
5074             //#ifndef DESTRUCTIVE_WRITEBACK
5075             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
5076             //#endif
5077             regs[i].wasdirty|=will_dirty_i&(1<<r);
5078           }
5079         }
5080         else if(regmap_pre[i][r]>=0&&(nr=get_reg(regs[i].regmap,regmap_pre[i][r]))>=0) {
5081           // Register moved to a different register
          // Take bit r from the bits of its new host register nr
5082           will_dirty_i&=~(1<<r);
5083           wont_dirty_i&=~(1<<r);
5084           will_dirty_i|=((temp_will_dirty>>nr)&1)<<r;
5085           wont_dirty_i|=((temp_wont_dirty>>nr)&1)<<r;
5086           if(wr) {
5087             //#ifndef DESTRUCTIVE_WRITEBACK
5088             regs[i].wasdirty&=wont_dirty_i|~(1<<r);
5089             //#endif
5090             regs[i].wasdirty|=will_dirty_i&(1<<r);
5091           }
5092         }
5093         else {
          // Previous occupant is no longer mapped anywhere: use its
          // unneeded_reg[i] bit when it is a register below TBIT, otherwise
          // set only the wont_dirty bit.  The ">=0" test is always true
          // after masking with 63.
5094           will_dirty_i&=~(1<<r);
5095           wont_dirty_i&=~(1<<r);
5096           if((regmap_pre[i][r]&63)>=0 && (regmap_pre[i][r]&63)<TBIT) {
5097             will_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
5098             wont_dirty_i|=((unneeded_reg[i]>>(regmap_pre[i][r]&63))&1)<<r;
5099           } else {
5100             wont_dirty_i|=1<<r;
5101             /*printf("i: %x (%d) mismatch: %d\n",start+i*4,i,r);assert(!((will_dirty>>r)&1));*/
5102           }
5103         }
5104       }
5105     }
5106   }
5107 }
5108 
5109   /* disassembly */
disassemble_inst(int i)5110 void disassemble_inst(int i)
5111 {
5112     if (bt[i]) printf("*"); else printf(" ");
5113     switch(itype[i]) {
5114       case UJUMP:
5115       case CJUMP:
5116       case SJUMP:
5117         printf (" %x: %s %8x\n",start+i*2,insn[i],ba[i]);break;
5118       case RJUMP:
5119         printf (" %x: %s r%d\n",start+i*2,insn[i],rs1[i]);break;
5120       case IMM8:
5121         printf (" %x: %s #%d,r%d\n",start+i*2,insn[i],imm[i],opcode[i]==14?rt1[i]:rs1[i]);
5122         break;
5123       case LOAD:
5124         switch(addrmode[i])
5125         {
5126           case REGIND:
5127             printf (" %x: %s @r%d,r%d\n",start+i*2,insn[i],rs1[i],rt1[i]);
5128             break;
5129           case POSTINC:
5130             printf (" %x: %s @r%d+,r%d\n",start+i*2,insn[i],rs1[i],rt1[i]);
5131             break;
5132           case PREDEC:
5133             printf (" %x: %s @-r%d,r%d\n",start+i*2,insn[i],rs1[i],rt1[i]);
5134             break;
5135           case DUALIND:
5136             printf (" %x: %s @(R0,r%d),r%d\n",start+i*2,insn[i],rs1[i],rt1[i]);
5137             break;
5138           case GBRIND:
5139             printf (" %x: %s #%d,@(R0,GBR)\n",start+i*2,insn[i],imm[i]);
5140             break;
5141           case GBRDISP:
5142             printf (" %x: %s @(%d,GBR),r%d\n",start+i*2,insn[i],imm[i],rt1[i]);
5143             break;
5144           case REGDISP:
5145             printf (" %x: %s @(%d,r%d),r%d\n",start+i*2,insn[i],imm[i],rs1[i],rt1[i]);
5146             break;
5147         }
5148         break;
5149       case STORE:
5150         switch(addrmode[i])
5151         {
5152           case REGIND:
5153             printf (" %x: %s r%d,@r%d\n",start+i*2,insn[i],rs1[i],rs2[i]);
5154             break;
5155           case POSTINC:
5156             printf (" %x: %s r%d,@r%d+\n",start+i*2,insn[i],rs1[i],rs2[i]);
5157             break;
5158           case PREDEC:
5159             printf (" %x: %s r%d,@-r%d\n",start+i*2,insn[i],rs1[i],rs2[i]);
5160             break;
5161           case DUALIND:
5162             printf (" %x: %s r%d,@(R0,r%d)\n",start+i*2,insn[i],rs1[i],rs2[i]);
5163             break;
5164           case GBRDISP:
5165             printf (" %x: %s r%d,@(%d,GBR)\n",start+i*2,insn[i],rs1[i],imm[i]);
5166             break;
5167           case REGDISP:
5168             printf (" %x: %s r%d,@(%d,r%d)\n",start+i*2,insn[i],rs1[i],imm[i],rs2[i]);
5169             break;
5170         }
5171         break;
5172       case RMW:
5173         switch(addrmode[i])
5174         {
5175           case REGIND:
5176             printf (" %x: %s @r%d\n",start+i*2,insn[i],rs1[i]);
5177             break;
5178           case GBRIND:
5179             printf (" %x: %s #%d,@(R0,GBR)\n",start+i*2,insn[i],imm[i]);
5180             break;
5181         }
5182         break;
5183       case PCREL:
5184         printf (" %x: %s @(%x,PC),r%d (PC+%d=%x)",start+i*2,insn[i],imm[i],rt1[i],imm[i],((start+i*2+4)&(opcode[i]==9?~1:~3))+imm[i]);
5185         if (opcode[i]==9 && (unsigned)(i+(imm[i]>>1))<slen)
5186           printf(" [%x]\n",(s16)source[((start+i*2+4)+imm[i]-start)>>1]); // MOV.W
5187         else if (opcode[i]==13 && (unsigned)(i+(imm[i]>>1))<slen)
5188           printf(" [%8x]\n",(source[(((start+i*2+4)&~3)+imm[i]-start)>>1]<<16)+source[(((start+i*2+4)&~3)+imm[i]+2-start)>>1]); // MOV.L
5189         else printf("\n");
5190         if (opcode[i]==13 && (unsigned)(i+(imm[i]>>1))<slen)
5191           if((source[(((start+i*2+4)&~3)+imm[i]-start)>>1]<<16)+source[(((start+i*2+4)&~3)+imm[i]+2-start)>>1]-(start+i*2)<(unsigned)1024)
5192             printf("Within 1024\n");
5193         break;
5194       case ALU:
5195         if(rs1[i]<0&&rs2[i]<0) // XOR reg,reg case
5196           printf (" %x: %s r%d,r%d\n",start+i*2,insn[i],rt1[i],rt1[i]);
5197         else if(rs2[i]>=0&&rs2[i]!=TBIT)
5198           printf (" %x: %s r%d,r%d\n",start+i*2,insn[i],rs1[i],rs2[i]);
5199         else if(rt1[i]!=rs1[i])
5200           printf (" %x: %s r%d,r%d\n",start+i*2,insn[i],rs1[i],rt1[i]);
5201         else
5202           printf (" %x: %s r%d\n",start+i*2,insn[i],rs1[i]);
5203         break;
5204       case MULTDIV:
5205         //printf (" %x: %s rt1=%d rt2=%d\n",start+i*2,insn[i],rt1[i],rt2[i]);
5206         printf (" %x: %s r%d,r%d\n",start+i*2,insn[i],rs1[i],rs2[i]);
5207         break;
5208       case SHIFTIMM:
5209         if(rs2[i]>=0) printf (" %x: %s r%d,r%d #%d\n",start+i*2,insn[i],rs1[i],rs2[i],imm[i]);
5210         else printf (" %x: %s r%d #%d\n",start+i*2,insn[i],rt1[i],imm[i]);
5211         break;
5212       case MOV:
5213         printf (" %x: %s r%d,r%d\n",start+i*2,insn[i],rs1[i],rt1[i]);
5214         break;
5215       case EXT:
5216         printf (" %x: %s r%d,r%d\n",start+i*2,insn[i],rs1[i],rt1[i]);
5217         break;
5218       case FLAGS:
5219         if(opcode2[i]==9) printf (" %x: %s r%d\n",start+i*2,insn[i],rt1[i]);
5220         else printf (" %x: %s\n",start+i*2,insn[i]);
5221         break;
5222       case COMPLEX:
5223         printf (" %x: %s r%d,r%d\n",start+i*2,insn[i],rs1[i],rs2[i]);
5224         break;
5225       case DATA:
5226         printf (" %x: WORD %4x\n",start+i*2,source[i]&0xFFFF); // Constant data
5227         break;
5228       default:
5229         //printf (" %s %8x\n",insn[i],source[i]);
5230         printf (" %x: %s\n",start+i*2,insn[i]);
5231     }
5232 }
5233 
sh2_dynarec_init()5234 void sh2_dynarec_init()
5235 {
5236   int n;
5237   //printf("Init new dynarec\n");
5238   out=(u8 *)BASE_ADDR;
5239   if (mmap (out, 1<<TARGET_SIZE_2,
5240             PROT_READ | PROT_WRITE | PROT_EXEC,
5241             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
5242             -1, 0) <= 0) {printf("mmap() failed\n");}
5243   //for(n=0x80000;n<0x80800;n++)
5244   //  invalid_code[n]=1;
5245   for(n=0;n<131072;n++)
5246     cached_code[n]=0;
5247   for(n=0;n<262144;n++)
5248     cached_code_words[n]=0;
5249   for(n=0;n<65536;n++)
5250     hash_table[n][0]=hash_table[n][2]=-1;
5251   memset(mini_ht_master,-1,sizeof(mini_ht_master));
5252   memset(mini_ht_slave,-1,sizeof(mini_ht_slave));
5253   memset(restore_candidate,0,sizeof(restore_candidate));
5254   copy=shadow;
5255   expirep=16384; // Expiry pointer, +2 blocks
5256   literalcount=0;
5257   stop_after_jal=0;
5258   if (mmap ((void *)0x80000000, 4194304,
5259             PROT_READ | PROT_WRITE,
5260             MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS,
5261             -1, 0) <= 0) {printf("mmap() failed\n");}
5262 
5263   // This has to be done after BiosRom etc are allocated
5264   for(n=0;n<1048576;n++) {
5265     if(n<0x100) {
5266       #ifdef POINTERS_64BIT
5267       memory_map[n]=(((u64)BiosRom-((n<<12)&0x80000))>>2)|0x4000000000000000LL;
5268       #else
5269       memory_map[n]=(((u32)BiosRom-((n<<12)&0x80000))>>2)|0x40000000;
5270       #endif
5271     }else
5272     if(n>=0x0200&&n<0x0300) {
5273       #ifdef POINTERS_64BIT
5274       memory_map[n]=((u64)LowWram-((n<<12)&0xFFF00000))>>2;
5275       #else
5276       memory_map[n]=((u32)LowWram-((n<<12)&0xFFF00000))>>2;
5277       #endif
5278     }else
5279     if(n>=0x6000&&n<0x8000) {
5280       #ifdef POINTERS_64BIT
5281       memory_map[n]=((u64)HighWram-((n<<12)&0xFFF00000))>>2;
5282       #else
5283       memory_map[n]=((u32)HighWram-((n<<12)&0xFFF00000))>>2;
5284       #endif
5285     }else
5286     if(n>=0x20200&&n<0x20300) {
5287       #ifdef POINTERS_64BIT
5288       memory_map[n]=((u64)LowWram-((n<<12)&0xFFF00000))>>2;
5289       #else
5290       memory_map[n]=((u32)LowWram-((n<<12)&0xFFF00000))>>2;
5291       #endif
5292     }else
5293     if(n>=0x26000&&n<0x28000) {
5294       #ifdef POINTERS_64BIT
5295       memory_map[n]=((u64)HighWram-((n<<12)&0xFFF00000))>>2;
5296       #else
5297       memory_map[n]=((u32)HighWram-((n<<12)&0xFFF00000))>>2;
5298       #endif
5299     }else
5300       memory_map[n]=-1LL;
5301   }
5302 
5303   master_cc=slave_cc=0;
5304   slave_ip=(void *)0; // Slave not running, go directly to interrupt handler
5305 
5306   arch_init();
5307 }
5308 
/*
 * Reset the recompiler's per-CPU bookkeeping for one SH2 core.
 *
 * context: the core being reset; it is compared against the global MSH2
 *          and SSH2 pointers to decide which state to clear.
 *
 * For MSH2 only master_cc is zeroed.  For SSH2 slave_ip is cleared to a
 * null pointer and slave_cc is zeroed.  Any other context value leaves
 * everything untouched.
 * NOTE(review): master_cc / slave_cc look like per-core cycle counters,
 * and a null slave_ip appears to mean "slave not running" - confirm.
 */
void SH2DynarecReset(SH2_struct *context)
{
  if (context == MSH2)
  {
    master_cc = 0;
  }

  /* Tested independently of the check above, not as an else-branch. */
  if (context == SSH2)
  {
    slave_ip = (void *)0;
    slave_cc = 0;
  }
}
5315 
sh2_dynarec_cleanup()5316 void sh2_dynarec_cleanup()
5317 {
5318   int n;
5319   if (munmap ((void *)BASE_ADDR, 1<<TARGET_SIZE_2) < 0) {printf("munmap() failed\n");}
5320   for(n=0;n<2048;n++) ll_clear(jump_in+n);
5321   for(n=0;n<2048;n++) ll_clear(jump_out+n);
5322   for(n=0;n<2048;n++) ll_clear(jump_dirty+n);
5323 }
5324 
sh2_recompile_block(int addr)5325 int sh2_recompile_block(int addr)
5326 {
5327   pointer beginning;
5328   int hr;
5329   int ds=0;
5330   int i,j;
5331   int done=0;
5332   unsigned int type,mode,op,op2,op3;
5333   unsigned int lastconst=0;
5334   unsigned int writelimit=0xFFFFFFFF;
5335   u32 p_constmap[SH2_REGS];
5336   u32 p_isconst=0;
5337   int cached_addr;
5338 
5339   //if(Count==365117028) tracedebug=1;
5340   assem_debug("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
5341   //printf("NOTCOMPILED: addr = %x -> %x\n", (int)addr, (int)out);
5342   //printf("TRACE: count=%d next=%d (compile %x)\n",Count,next_interupt,addr);
5343   //if(debug)
5344   //printf("TRACE: count=%d next=%d (checksum %x)\n",Count,next_interupt,mchecksum());
5345   //printf("fpu mapping=%x enabled=%x\n",(Status & 0x04000000)>>26,(Status & 0x20000000)>>29);
5346   /*if(Count>=312978186) {
5347     rlist();
5348   }*/
5349   //rlist();
5350   start = (u32)addr&~1;
5351   slave = (u32)addr&1;
5352   cached_addr = start&~0x20000000;
5353   //assert(((u32)addr&1)==0);
5354   if (cached_addr >= 0x00000000 && cached_addr < 0x00100000) {
5355     source = (u16 *)((char *)BiosRom+(start & 0x7FFFF));
5356     pagelimit = (addr|0x7FFFF) + 1;
5357   }
5358   else if (cached_addr >= 0x00200000 && cached_addr < 0x00300000) {
5359     source = (u16 *)((char *)LowWram+(start & 0xFFFFF));
5360     pagelimit = (addr|0xFFFFF) + 1;
5361   }
5362   else if (cached_addr >= 0x06000000 && cached_addr < 0x08000000) {
5363     source = (u16 *)((char *)HighWram+(start & 0xFFFFF));
5364     pagelimit = (addr|0xFFFFF) + 1;
5365   }
5366   else {
5367     printf("Compile at bogus memory address: %x \n", (int)addr);
5368     exit(1);
5369   }
5370   //printf("source= %x\n",(int)source);
5371 
5372   alignedsource=(void *)(((pointer)source)&~3);
5373 
5374   /* Pass 1: disassemble */
5375   /* Pass 2: register dependencies, branch targets */
5376   /* Pass 3: register allocation */
5377   /* Pass 4: branch dependencies */
5378   /* Pass 5: pre-alloc */
5379   /* Pass 6: optimize clean/dirty state */
5380   /* Pass 7: identify interrupt return locations */
5381   /* Pass 8: assembly */
5382   /* Pass 9: linker */
5383   /* Pass 10: garbage collection / free memory */
5384 
5385   slen=MAXBLOCK;
5386 
5387   //printf("addr = %x source = %x %x\n", addr,source,source[0]);
5388 
5389   /* Pass 1 disassembly */
5390 
5391   for(i=0;i<8;i++) {
5392     //printf("recent write: %x\n",recent_writes[i]);
5393     if(recent_writes[i]<writelimit) {
5394       if(recent_writes[i]>start) writelimit=recent_writes[i];
5395     }
5396   }
5397 
5398   for(i=0;!done;i++) {
5399     bt[i]=0;ooo[i]=0;op2=0;op3=0;mode=0;
5400     minimum_free_regs[i]=0;
5401     opcode[i]=op=source[i]>>12;
5402     strcpy(insn[i],"???"); type=NI;
5403     switch(op)
5404     {
5405       case 0x00:
5406         op2=source[i]&0xf;
5407         op3=(source[i]>>4)&0xf;
5408         switch(op2)
5409         {
5410           case 0x02: strcpy(insn[i],"STC"); type=MOV; break;
5411           case 0x03:
5412             switch(op3)
5413             {
5414               case 0x00: strcpy(insn[i],"BSRF"); type=RJUMP; break;
5415               case 0x02: strcpy(insn[i],"BRAF"); type=RJUMP; break;
5416             }
5417             break;
5418           case 0x04: strcpy(insn[i],"MOV.B"); type=STORE;mode=DUALIND; break;
5419           case 0x05: strcpy(insn[i],"MOV.W"); type=STORE;mode=DUALIND; break;
5420           case 0x06: strcpy(insn[i],"MOV.L"); type=STORE;mode=DUALIND; break;
5421           case 0x07: strcpy(insn[i],"MUL.L"); type=MULTDIV; break;
5422           case 0x08:
5423             switch(op3)
5424             {
5425               case 0x00: strcpy(insn[i],"CLRT"); type=FLAGS; break;
5426               case 0x01: strcpy(insn[i],"SETT"); type=FLAGS; break;
5427               case 0x02: strcpy(insn[i],"CLRMAC"); type=MULTDIV; break;
5428             }
5429             break;
5430           case 0x09:
5431             switch(op3)
5432             {
5433               case 0x00: strcpy(insn[i],"NOP"); type=NOP; break;
5434               case 0x01: strcpy(insn[i],"DIV0U"); type=MULTDIV; break;
5435               case 0x02: strcpy(insn[i],"MOVT"); type=FLAGS; break;
5436             }
5437             break;
5438           case 0x0A: strcpy(insn[i],"STS"); type=MOV; break;
5439           case 0x0B:
5440             switch(op3)
5441             {
5442               case 0x00: strcpy(insn[i],"RTS"); type=RJUMP; break;
5443               case 0x01: strcpy(insn[i],"SLEEP"); type=SYSTEM; break;
5444               case 0x02: strcpy(insn[i],"RTE"); type=RJUMP; break;
5445             }
5446             break;
5447           case 0x0C: strcpy(insn[i],"MOV.B"); type=LOAD;mode=DUALIND; break;
5448           case 0x0D: strcpy(insn[i],"MOV.W"); type=LOAD;mode=DUALIND; break;
5449           case 0x0E: strcpy(insn[i],"MOV.L"); type=LOAD;mode=DUALIND; break;
5450           case 0x0F: strcpy(insn[i],"MAC.L"); type=COMPLEX; break;
5451         }
5452         break;
5453       case 0x01: strcpy(insn[i],"MOV.L"); type=STORE;mode=REGDISP;op2=2; break;
5454       case 0x02:
5455         op2=source[i]&0xf;
5456         switch(op2)
5457         {
5458           case 0x00: strcpy(insn[i],"MOV.B"); type=STORE;mode=REGIND; break;
5459           case 0x01: strcpy(insn[i],"MOV.W"); type=STORE;mode=REGIND; break;
5460           case 0x02: strcpy(insn[i],"MOV.L"); type=STORE;mode=REGIND; break;
5461           case 0x04: strcpy(insn[i],"MOV.B"); type=STORE;mode=PREDEC; break;
5462           case 0x05: strcpy(insn[i],"MOV.W"); type=STORE;mode=PREDEC; break;
5463           case 0x06: strcpy(insn[i],"MOV.L"); type=STORE;mode=PREDEC; break;
5464           case 0x07: strcpy(insn[i],"DIV0S"); type=MULTDIV; break;
5465           case 0x08: strcpy(insn[i],"TST"); type=ALU; break;
5466           case 0x09: strcpy(insn[i],"AND"); type=ALU; break;
5467           case 0x0A: strcpy(insn[i],"XOR"); type=ALU; break;
5468           case 0x0B: strcpy(insn[i],"OR"); type=ALU; break;
5469           case 0x0C: strcpy(insn[i],"CMP/ST"); type=ALU; break;
5470           case 0x0D: strcpy(insn[i],"XTRCT"); type=SHIFTIMM; break;
5471           case 0x0E: strcpy(insn[i],"MULU.W"); type=MULTDIV; break;
5472           case 0x0F: strcpy(insn[i],"MULS.W"); type=MULTDIV; break;
5473         }
5474         break;
5475       case 0x03:
5476         op2=source[i]&0xf;
5477         switch(op2)
5478         {
5479           case 0x00: strcpy(insn[i],"CMP/EQ"); type=ALU; break;
5480           case 0x02: strcpy(insn[i],"CMP/HS"); type=ALU; break;
5481           case 0x03: strcpy(insn[i],"CMP/GE"); type=ALU; break;
5482           case 0x04: strcpy(insn[i],"DIV1"); type=COMPLEX; break;
5483           case 0x05: strcpy(insn[i],"DMULU.L"); type=MULTDIV; break;
5484           case 0x06: strcpy(insn[i],"CMP/HI"); type=ALU; break;
5485           case 0x07: strcpy(insn[i],"CMP/GT"); type=ALU; break;
5486           case 0x08: strcpy(insn[i],"SUB"); type=ALU; break;
5487           case 0x0A: strcpy(insn[i],"SUBC"); type=ALU; break;
5488           case 0x0B: strcpy(insn[i],"SUBV"); type=ALU; break;
5489           case 0x0C: strcpy(insn[i],"ADD"); type=ALU; break;
5490           case 0x0D: strcpy(insn[i],"DMULS.L"); type=MULTDIV; break;
5491           case 0x0E: strcpy(insn[i],"ADDC"); type=ALU; break;
5492           case 0x0F: strcpy(insn[i],"ADDV"); type=ALU; break;
5493         }
5494         break;
5495       case 0x04:
5496         op2=source[i]&0xf;
5497         op3=(source[i]>>4)&0xf;
5498         switch(op2)
5499         {
5500           case 0x00:
5501             switch(op3)
5502             {
5503               case 0x00: strcpy(insn[i],"SHLL"); type=SHIFTIMM; break;
5504               case 0x01: strcpy(insn[i],"DT"); type=ALU; break;
5505               case 0x02: strcpy(insn[i],"SHAL"); type=SHIFTIMM; break;
5506             }
5507             break;
5508           case 0x01:
5509             switch(op3)
5510             {
5511               case 0x00: strcpy(insn[i],"SHLR"); type=SHIFTIMM; break;
5512               case 0x01: strcpy(insn[i],"CMP/PZ"); type=ALU; break;
5513               case 0x02: strcpy(insn[i],"SHAR"); type=SHIFTIMM; break;
5514             }
5515             break;
5516           case 0x02: strcpy(insn[i],"STS.L"); type=STORE;mode=PREDEC; break;
5517           case 0x03: strcpy(insn[i],"STC.L"); type=STORE;mode=PREDEC; break;
5518           case 0x04:
5519             switch(op3)
5520             {
5521               case 0x00: strcpy(insn[i],"ROTL"); type=SHIFTIMM; break;
5522               case 0x02: strcpy(insn[i],"ROTCL"); type=SHIFTIMM; break;
5523             }
5524             break;
5525           case 0x05:
5526             switch(op3)
5527             {
5528               case 0x00: strcpy(insn[i],"ROTR"); type=SHIFTIMM; break;
5529               case 0x01: strcpy(insn[i],"CMP/PL"); type=ALU; break;
5530               case 0x02: strcpy(insn[i],"ROTCR"); type=SHIFTIMM; break;
5531             }
5532             break;
5533           case 0x06: strcpy(insn[i],"LDS.L"); type=LOAD;mode=POSTINC; break;
5534           case 0x07: strcpy(insn[i],"LDC.L"); type=LOAD;mode=POSTINC; break;
5535           case 0x08:
5536             switch(op3)
5537             {
5538               case 0x00: strcpy(insn[i],"SHLL2"); type=SHIFTIMM; break;
5539               case 0x01: strcpy(insn[i],"SHLL8"); type=SHIFTIMM; break;
5540               case 0x02: strcpy(insn[i],"SHLL16"); type=SHIFTIMM; break;
5541             }
5542             break;
5543           case 0x09:
5544             switch(op3)
5545             {
5546               case 0x00: strcpy(insn[i],"SHLR2"); type=SHIFTIMM; break;
5547               case 0x01: strcpy(insn[i],"SHLR8"); type=SHIFTIMM; break;
5548               case 0x02: strcpy(insn[i],"SHLR16"); type=SHIFTIMM; break;
5549             }
5550             break;
5551           case 0x0A: strcpy(insn[i],"LDS"); type=MOV; break;
5552           case 0x0B:
5553             switch(op3)
5554             {
5555               case 0x00: strcpy(insn[i],"JSR"); type=RJUMP; break;
5556               case 0x01: strcpy(insn[i],"TAS.B"); type=RMW;mode=REGIND; break;
5557               case 0x02: strcpy(insn[i],"JMP"); type=RJUMP; break;
5558             }
5559             break;
5560           case 0x0E: strcpy(insn[i],"LDC"); type=MOV; break;
5561           case 0x0F: strcpy(insn[i],"MAC.W"); type=COMPLEX; break;
5562         }
5563         break;
5564       case 0x05: strcpy(insn[i],"MOV.L"); type=LOAD;mode=REGDISP;op2=2; break;
5565       case 0x06:
5566         op2=source[i]&0xf;
5567         switch(op2)
5568         {
5569           case 0x00: strcpy(insn[i],"MOV.B"); type=LOAD;mode=REGIND; break;
5570           case 0x01: strcpy(insn[i],"MOV.W"); type=LOAD;mode=REGIND; break;
5571           case 0x02: strcpy(insn[i],"MOV.L"); type=LOAD;mode=REGIND; break;
5572           case 0x03: strcpy(insn[i],"MOV"); type=MOV; break;
5573           case 0x04: strcpy(insn[i],"MOV.B"); type=LOAD;mode=POSTINC; break;
5574           case 0x05: strcpy(insn[i],"MOV.W"); type=LOAD;mode=POSTINC; break;
5575           case 0x06: strcpy(insn[i],"MOV.L"); type=LOAD;mode=POSTINC; break;
5576           case 0x07: strcpy(insn[i],"NOT"); type=ALU; break;
5577           case 0x08: strcpy(insn[i],"SWAP.B"); type=ALU; break;
5578           case 0x09: strcpy(insn[i],"SWAP.W"); type=ALU; break;
5579           case 0x0A: strcpy(insn[i],"NEGC"); type=ALU; break;
5580           case 0x0B: strcpy(insn[i],"NEG"); type=ALU; break;
5581           case 0x0C: strcpy(insn[i],"EXTU.B"); type=EXT; break;
5582           case 0x0D: strcpy(insn[i],"EXTU.W"); type=EXT; break;
5583           case 0x0E: strcpy(insn[i],"EXTS.B"); type=EXT; break;
5584           case 0x0F: strcpy(insn[i],"EXTS.W"); type=EXT; break;
5585         }
5586         break;
5587       case 0x07: strcpy(insn[i],"ADD"); type=IMM8; break;
5588       case 0x08:
5589         op2=(source[i]>>8)&0xf;
5590         switch(op2)
5591         {
5592           case 0x00: strcpy(insn[i],"MOV.B"); type=STORE;mode=REGDISP; break;
5593           case 0x01: strcpy(insn[i],"MOV.W"); type=STORE;mode=REGDISP; break;
5594           case 0x04: strcpy(insn[i],"MOV.B"); type=LOAD;mode=REGDISP; break;
5595           case 0x05: strcpy(insn[i],"MOV.W"); type=LOAD;mode=REGDISP; break;
5596           case 0x08: strcpy(insn[i],"CMP/EQ"); type=IMM8; break;
5597           case 0x09: strcpy(insn[i],"BT"); type=CJUMP; break;
5598           case 0x0B: strcpy(insn[i],"BF"); type=CJUMP; break;
5599           case 0x0D: strcpy(insn[i],"BT/S"); type=SJUMP; break;
5600           case 0x0F: strcpy(insn[i],"BF/S"); type=SJUMP; break;
5601         }
5602         break;
5603       case 0x09: strcpy(insn[i],"MOV.W"); type=PCREL; break;
5604       case 0x0A: strcpy(insn[i],"BRA"); type=UJUMP; break;
5605       case 0x0B: strcpy(insn[i],"BSR"); type=UJUMP; break;
5606       case 0x0C:
5607         op2=(source[i]>>8)&0xf;
5608         switch(op2)
5609         {
5610           case 0x00: strcpy(insn[i],"MOV.B"); type=STORE;mode=GBRDISP; break;
5611           case 0x01: strcpy(insn[i],"MOV.W"); type=STORE;mode=GBRDISP; break;
5612           case 0x02: strcpy(insn[i],"MOV.L"); type=STORE;mode=GBRDISP; break;
5613           case 0x03: strcpy(insn[i],"TRAPA"); type=SYSTEM; break;
5614           case 0x04: strcpy(insn[i],"MOV.B"); type=LOAD;mode=GBRDISP; break;
5615           case 0x05: strcpy(insn[i],"MOV.W"); type=LOAD;mode=GBRDISP; break;
5616           case 0x06: strcpy(insn[i],"MOV.L"); type=LOAD;mode=GBRDISP; break;
5617           case 0x07: strcpy(insn[i],"MOVA"); type=PCREL; break;
5618           case 0x08: strcpy(insn[i],"TST"); type=IMM8; break;
5619           case 0x09: strcpy(insn[i],"AND"); type=IMM8; break;
5620           case 0x0A: strcpy(insn[i],"XOR"); type=IMM8; break;
5621           case 0x0B: strcpy(insn[i],"OR"); type=IMM8; break;
5622           case 0x0C: strcpy(insn[i],"TST.B"); type=LOAD;mode=GBRIND; break;
5623           case 0x0D: strcpy(insn[i],"AND.B"); type=RMW;mode=GBRIND; break;
5624           case 0x0E: strcpy(insn[i],"XOR.B"); type=RMW;mode=GBRIND; break;
5625           case 0x0F: strcpy(insn[i],"OR.B"); type=RMW;mode=GBRIND; break;
5626         }
5627         break;
5628       case 0x0D: strcpy(insn[i],"MOV.L"); type=PCREL; break;
5629       case 0x0E: strcpy(insn[i],"MOV"); type=IMM8; break;
5630       default: strcpy(insn[i],"???"); type=NI; break;
5631     }
5632     itype[i]=type;
5633     addrmode[i]=mode;
5634     opcode2[i]=op2;
5635     opcode3[i]=op3;
5636     /* Get registers/immediates */
5637     rs1[i]=-1;
5638     rs2[i]=-1;
5639     rs3[i]=-1;
5640     rt1[i]=-1;
5641     rt2[i]=-1;
5642     lt1[i]=-1;
5643     cycles[i]=1;
5644     switch(type) {
5645       case LOAD:
5646         if(mode==GBRDISP||mode==GBRIND) rs1[i]=GBR;
5647         else rs1[i]=(source[i]>>4)&0xf;
5648         if(mode==DUALIND||mode==GBRIND) rs2[i]=0;
5649         if(op==4) {
5650           // LDS/LDC
5651           rs1[i]=(source[i]>>8)&0xf;
5652           if(op2==6) rt1[i]=((source[i]>>4)&0xf)+MACH;
5653           if(op2==7) {rt1[i]=((source[i]>>4)&0xf)+SR;cycles[i]=3;}
5654           if(rt1[i]==SR) rt2[i]=TBIT;
5655         }
5656         else if(op==8)
5657           rt1[i]=0; // (@disp,rm),r0
5658         else if(op==12) {
5659           if(op2!=12)
5660             rt1[i]=0; // (@disp,GBR),r0
5661           else {
5662             imm[i]=(unsigned int)((unsigned char)source[i]);
5663             rt1[i]=TBIT; // TST.B
5664             cycles[i]=3;
5665           }
5666         }
5667         else {
5668           rt1[i]=(source[i]>>8)&0xf;
5669         }
5670         if(mode==REGDISP) {
5671           imm[i]=(unsigned int)source[i]&0xF;
5672           if(op==5) imm[i]<<=2; // MOV.L
5673           if(op==8&&op2==5) imm[i]<<=1; // MOV.W
5674         }
5675         else if(mode==GBRDISP) {
5676           imm[i]=(unsigned int)((unsigned char)source[i])<<(op2&3);
5677         }
5678         else if(mode!=GBRIND) imm[i]=0;
5679         if(mode==POSTINC) rt2[i]=rs1[i];
5680         break;
5681       case STORE:
5682         if(op==4) {
5683           // STS/STC
5684           if(op2==2) rs1[i]=((source[i]>>4)&0xf)+MACH;
5685           if(op2==3) {rs1[i]=((source[i]>>4)&0xf)+SR;cycles[i]=2;}
5686           if(rs1[i]==SR) rs3[i]=TBIT;
5687         }
5688         else
5689         if(op==8)
5690           rs1[i]=0; // r0,(@disp,rn)
5691         else if(op==12)
5692           rs1[i]=0; // r0,(@disp,GBR)
5693         else
5694           rs1[i]=(source[i]>>4)&0xf;
5695         if(mode==GBRDISP) rs2[i]=GBR;
5696         else if(op==8) rs2[i]=(source[i]>>4)&0xf; // r0,(@disp,rn)
5697         else rs2[i]=(source[i]>>8)&0xf;
5698         if(mode==DUALIND) rs3[i]=0;
5699         if(mode==REGDISP) {
5700           imm[i]=(unsigned int)source[i]&0xF;
5701           if(op==1) imm[i]<<=2; // MOV.L
5702           if(op==8&&op2==1) imm[i]<<=1; // MOV.W
5703         }
5704         else if(mode==GBRDISP) {
5705           imm[i]=(unsigned int)((unsigned char)source[i])<<(op2&3);
5706         }
5707         else imm[i]=0;
5708         if(mode==PREDEC) rt1[i]=rs2[i];
5709         if( (mode==DUALIND&&((p_isconst>>rs2[i])&(p_isconst>>rs3[i])&1)) ||
5710             (mode!=DUALIND&&((p_isconst>>rs2[i])&1)) )
5711         {
5712           u32 addr;
5713           if(mode==DUALIND) addr=p_constmap[rs2[i]]+p_constmap[rs3[i]];
5714           if(mode==REGDISP||mode==GBRDISP) addr=p_constmap[rs2[i]]+imm[i];
5715           if(mode==PREDEC) addr=(p_constmap[rs2[i]]-=4);
5716           if(mode==REGIND) addr=p_constmap[rs2[i]];
5717           if(addr>start+i*2&&addr<writelimit) writelimit=addr;
5718           assem_debug("Instruction at %x possibly writes %x (limit=%x)\n",start+i*2,addr,writelimit);
5719         }
5720         break;
5721       case RMW:
5722         if(op==4) // TAS.B
5723         {
5724           rs1[i]=(source[i]>>8)&0xf;
5725           rt1[i]=TBIT;
5726           imm[i]=0;
5727           cycles[i]=4;
5728         }
5729         if(op==12) // AND.B/XOR.B/OR.B
5730         {
5731           rs1[i]=GBR;
5732           rs2[i]=0;
5733           imm[i]=(unsigned int)((unsigned char)source[i]);
5734           cycles[i]=3;
5735         }
5736         break;
5737       case PCREL:
5738         imm[i]=(signed int)((unsigned char)source[i]);
5739         if(op==12) rt1[i]=0; // MOVA
5740         else rt1[i]=(source[i]>>8)&0xf;
5741         if(op==9) imm[i]<<=1; // MOV.W
5742         else imm[i]<<=2;
5743         // Extend block to include consts
5744         // FIXME: Don't go past limit
5745         if (op==9 && lastconst < (start+i*2+4)+imm[i]) // MOV.W
5746           lastconst = (start+i*2+4)+imm[i];
5747         if (op==13 && lastconst < ((start+i*2+4)&~3)+imm[i]+2) // MOV.L
5748           lastconst = ((start+i*2+4)&~3)+imm[i]+2;
5749         //printf("lastconst=%x\n",lastconst);
5750         break;
5751       case MOV:
5752         if(op==6) {
5753           rs1[i]=(source[i]>>4)&0xf;
5754           rt1[i]=(source[i]>>8)&0xf;
5755         }
5756         if(op==0) { // STC/STS
5757           if(op2==2) rs1[i]=((source[i]>>4)&0xf)+SR; //STC
5758           if(op2==10) rs1[i]=((source[i]>>4)&0xf)+MACH; //STS
5759           rt1[i]=(source[i]>>8)&0xf;
5760           if(rs1[i]==SR) rs2[i]=TBIT; // For liveness analysis
5761         }
5762         if(op==4) { // LDC/LDS
5763           if(op2==14) rt1[i]=((source[i]>>4)&0xf)+SR; //LDC
5764           if(op2==10) rt1[i]=((source[i]>>4)&0xf)+MACH; //LDS
5765           rs1[i]=(source[i]>>8)&0xf;
5766           if(rt1[i]==SR) rt2[i]=TBIT; // For liveness analysis
5767         }
5768         break;
5769       case IMM8:
5770         if(op==8) { // CMP/EQ r0
5771           rs1[i]=0;
5772           rt1[i]=TBIT;
5773           imm[i]=(signed int)((signed char)source[i]);
5774         }else
5775         if(op==12) {
5776           rs1[i]=0;
5777           if(op2==8)
5778             rt1[i]=TBIT; // TST
5779           else
5780             rt1[i]=0; // AND/XOR/OR
5781           imm[i]=(unsigned int)((unsigned char)source[i]);
5782         }else{ // ADD/MOV
5783           if(op==7) rs1[i]=(source[i]>>8)&0xf; // ADD
5784           rt1[i]=(source[i]>>8)&0xf;
5785           imm[i]=(signed int)((signed char)source[i]);
5786         }
5787         break;
5788       case FLAGS:
5789         if(op2==8) rt1[i]=TBIT; // CLRT/SETT
5790         if(op2==9) {rs1[i]=TBIT;rt1[i]=(source[i]>>8)&0xf;} // MOVT
5791         break;
5792       case ALU:
5793         if(op==2) {
5794           if(op2==8||op2==12) { // TST or CMP/STR
5795             rs1[i]=(source[i]>>4)&0xf;
5796             rs2[i]=(source[i]>>8)&0xf;
5797             rt1[i]=TBIT;
5798           }
5799           else
5800           { // AND/OR/XOR
5801             rs1[i]=(source[i]>>4)&0xf;
5802             rs2[i]=(source[i]>>8)&0xf;
5803             rt1[i]=(source[i]>>8)&0xf;
5804             if(op2==10&&rs1[i]==rs2[i]) {
5805               rs1[i]=-1;rs2[i]=-1; // Optimize XOR reg,reg
5806             }
5807           }
5808         }
5809         if(op==3) {
5810           if(op2<8) { // CMP
5811             rs1[i]=(source[i]>>4)&0xf;
5812             rs2[i]=(source[i]>>8)&0xf;
5813             rt1[i]=TBIT;
5814           }
5815           else
5816           { // ADD/SUB
5817             rs1[i]=(source[i]>>4)&0xf;
5818             rs2[i]=(source[i]>>8)&0xf;
5819             rt1[i]=(source[i]>>8)&0xf;
5820             if(op2==10||op2==14) rs3[i]=TBIT; // ADDC/SUBC read T bit
5821             if(op2!=8&&op2!=12) // ADDC/ADDV/SUBC/SUBV set T bit
5822               rt2[i]=TBIT;
5823           }
5824         }
5825         if(op==4) { // DT and compare with zero
5826           rs1[i]=(source[i]>>8)&0xf;
5827           if(op2==0) rt1[i]=(source[i]>>8)&0xf; // DT
5828           rt2[i]=TBIT;
5829         }
5830         if(op==6) { // NOT/NEG/NEGC/SWAP
5831           rs1[i]=(source[i]>>4)&0xf;
5832           rt1[i]=(source[i]>>8)&0xf;
5833           if(op2==10)
5834             rs2[i]=rt2[i]=TBIT; // NEGC sets T bit
5835         }
5836         break;
5837       case EXT:
5838         rs1[i]=(source[i]>>4)&0xf;
5839         rt1[i]=(source[i]>>8)&0xf;
5840         break;
5841       case MULTDIV:
5842         if(op==0) {
5843           if(op2==7) // MUL.L
5844           {
5845             rs1[i]=(source[i]>>4)&0xf;
5846             rs2[i]=(source[i]>>8)&0xf;
5847             rt1[i]=MACL;
5848             cycles[i]=2;
5849           }
5850           if(op2==8) // CLRMAC
5851           {
5852             rt1[i]=MACH;
5853             rt2[i]=MACL;
5854           }
5855           if(op2==9) // DIV0U
5856           {
5857             rs1[i]=SR;
5858             rt1[i]=SR;
5859             rt2[i]=TBIT;
5860           }
5861         }
5862         if(op==2) {
5863           if(op2==7) // DIV0S
5864           {
5865             rs1[i]=(source[i]>>4)&0xf;
5866             rs2[i]=(source[i]>>8)&0xf;
5867             rs3[i]=SR;
5868             rt1[i]=SR;
5869             rt2[i]=TBIT;
5870           }
5871           if(op2==14) // MULU.W
5872           {
5873             rs1[i]=(source[i]>>4)&0xf;
5874             rs2[i]=(source[i]>>8)&0xf;
5875             rt1[i]=MACL;
5876           }
5877           if(op2==15) // MULS.W
5878           {
5879             rs1[i]=(source[i]>>4)&0xf;
5880             rs2[i]=(source[i]>>8)&0xf;
5881             rt1[i]=MACL;
5882           }
5883         }
5884         if(op==3) {
5885           if(op2==5) // DMULU.L
5886           {
5887             rs1[i]=(source[i]>>4)&0xf;
5888             rs2[i]=(source[i]>>8)&0xf;
5889             rt1[i]=MACH;
5890             rt2[i]=MACL;
5891             cycles[i]=2;
5892           }
5893           if(op2==13) // DMULS.L
5894           {
5895             rs1[i]=(source[i]>>4)&0xf;
5896             rs2[i]=(source[i]>>8)&0xf;
5897             rt1[i]=MACH;
5898             rt2[i]=MACL;
5899             cycles[i]=2;
5900           }
5901         }
5902         break;
5903       case SHIFTIMM:
5904         rs1[i]=(source[i]>>8)&0xf;
5905         rt1[i]=(source[i]>>8)&0xf;
5906         if(op==4) {
5907           if(op2<6) rt2[i]=TBIT;
5908           if(op2==4||op2==5) {if(op3==2) rs2[i]=TBIT;} // ROTCL/ROTCR
5909         }
5910         if(op==2&op2==13) { // XTRCT
5911           rs1[i]=(source[i]>>4)&0xf;
5912           rs2[i]=(source[i]>>8)&0xf;
5913         }
5914         break;
5915       case UJUMP:
5916         rs2[i]=CCREG;
5917         if(op==11) rt1[i]=PR; // BSR
5918         cycles[i]=2;
5919         break;
5920       case RJUMP:
5921         rs1[i]=(source[i]>>8)&0xf;
5922         if (op==0&&op2==11&&op3==0) rs1[i]=PR; // RTS
5923         if ((op==0&&op2==3)||(op==4&&op2==11)) { // BSRF/JSR
5924           if(op3==0) rt1[i]=PR;
5925         }
5926         rs2[i]=CCREG;
5927         cycles[i]=2;
5928         if(op==0&&op2==11&&op3==2) { // RTE
5929           rs1[i]=15; // Stack pointer
5930           rs2[i]=CCREG;
5931           rt1[i]=SR;
5932           rt2[i]=15;
5933           cycles[i]=4;
5934         }
5935         break;
5936       case CJUMP:
5937         rs1[i]=TBIT;
5938         rs2[i]=CCREG;
5939         //cycles[i]=3; // Will be adjusted if branch is taken
5940         break;
5941       case SJUMP:
5942         rs1[i]=TBIT;
5943         rs2[i]=CCREG;
5944         //cycles[i]=2; // Will be adjusted if branch is taken
5945         break;
5946       case SYSTEM:
5947         if(op2==11&&op3==2) { // RTE
5948           rs1[i]=15; // Stack pointer
5949           rs2[i]=CCREG;
5950           rt1[i]=SR;
5951           rt2[i]=TBIT;
5952           cycles[i]=4;
5953         }
5954         else if(op==12) { // TRAPA
5955           rs1[i]=SR; // Status/flags
5956           //rs2[i]=CCREG;
5957           rs2[i]=VBR;
5958           rs3[i]=15; // Stack pointer
5959           imm[i]=(unsigned int)((unsigned char)source[i]);
5960           cycles[i]=8;
5961         }
5962         else { // SLEEP
5963           rs2[i]=CCREG;
5964           cycles[i]=8;
5965         }
5966         break;
5967       case COMPLEX:
5968         if(op==3&&op2==4) { // DIV1
5969           rs1[i]=(source[i]>>4)&0xf;
5970           rs2[i]=(source[i]>>8)&0xf;
5971           rs3[i]=SR;
5972           rt1[i]=(source[i]>>8)&0xf;
5973           rt2[i]=SR;
5974         }
5975         if(op==0&&op2==15) { // MAC.L
5976           rs1[i]=(source[i]>>4)&0xf;
5977           rs2[i]=(source[i]>>8)&0xf;
5978           rs3[i]=SR;
5979           rt1[i]=(source[i]>>4)&0xf;
5980           rt2[i]=(source[i]>>8)&0xf;
5981           cycles[i]=3;
5982         }
5983         if(op==4&&op2==15) { // MAC.W
5984           rs1[i]=(source[i]>>4)&0xf;
5985           rs2[i]=(source[i]>>8)&0xf;
5986           rs3[i]=SR;
5987           rt1[i]=(source[i]>>4)&0xf;
5988           rt2[i]=(source[i]>>8)&0xf;
5989           cycles[i]=3;
5990         }
5991         break;
5992     }
5993     // Do preliminary constant propagation
5994     do_consts(i,&p_isconst,p_constmap);
5995     /* Calculate branch target addresses */
5996     if(type==UJUMP)
5997       ba[i]=start+i*2+4+((((signed int)source[i])<<20)>>19);
5998     else if(type==CJUMP||type==SJUMP)
5999       ba[i]=start+i*2+4+((((signed int)source[i])<<24)>>23);
6000     else
6001     {
6002       ba[i]=-1;
6003       if(type==RJUMP) {
6004         if(op!=0||op2!=11||op3!=2) { // !RTE
6005           if((p_isconst>>rs1[i])&1)
6006           {
6007             u32 constaddr=p_constmap[rs1[i]];
6008             if(op==0&&op2==3) {
6009               // PC-relative branch, add PC+4
6010               constaddr+=start+i*2+4;
6011             }
6012             ba[i]=constaddr;
6013           }
6014         }
6015       }
6016     }
6017 
6018     // If the branch target was previously identified as data, back up
6019     if(ba[i]>start&&ba[i]<start+i*2) {
6020       //assert(itype[(ba[i]-start)>>1]!=DATA);
6021       if(itype[(ba[i]-start)>>1]==DATA||itype[(ba[i]+2-start)>>1]==DATA) {
6022         //printf("back up and redecode %x\n",ba[i]);
6023         i=(ba[i]-2-start)>>1;
6024         continue;
6025       }
6026     }
6027     /* Is this the end of the block? */
6028     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP)) {
6029       if(rt1[i-1]!=PR) { // Continue past subroutine call (BSR/JSR)
6030         unsigned int firstbt=0xFFFFFFFF;
6031         done=1;
6032         // Find next branch target (if any)
6033         for(j=i-1;j>=0;j--)
6034         {
6035           if(ba[j]>start+i*2-2&&ba[j]<firstbt) firstbt=ba[j];
6036         }
6037         // See if there are any backward branches following that one
6038         //printf("firstbt=%x diff=%d\n",firstbt,firstbt-(start+i*2));
6039         if(firstbt-(start+i*2)<(unsigned)4096) {
6040           u32 branch_addr;
6041           for(j=(firstbt-start)>>1;j<MAXBLOCK;j++) {
6042             if((source[j]&0xF900)==0x8900) { //BT(S)/BF(S)
6043               branch_addr=start+j*2+4+((((signed int)source[j])<<24)>>23);
6044               if(branch_addr>start+i*2&&branch_addr<firstbt) firstbt=branch_addr;
6045               //printf("firstbt=%x\n",firstbt);
6046             }
6047             if((source[j]&0xE000)==0xA000) { //BRA/BSR
6048               branch_addr=start+j*2+4+((((signed int)source[j])<<20)>>19);
6049               if(branch_addr>start+i*2&&branch_addr<firstbt) firstbt=branch_addr;
6050               //printf("firstbt=%x\n",firstbt);
6051               if((source[j]&0xF000)==0xA000) break; //BRA (stop after unconditional branch)
6052             }
6053             if((source[j]&0xF007)==0x0003) break; //BRAF/BSRF/RTS/RTE (stop after unconditional branch)
6054           }
6055         }
        // Skip constant pool
        // FIXME: check pagelimit
        // Step over the halfwords that follow an unconditional branch
        // without treating them as instructions.  The walk continues while
        // the next halfword (start+i*2+2) is at or below lastconst, is still
        // before the nearest pending branch target (firstbt, computed above),
        // start+i*2+1024 stays under writelimit, and i stays below MAXBLOCK-1.
        // NOTE(review): lastconst and writelimit are set outside this excerpt;
        // presumably the highest literal address referenced so far and the
        // end of the safely readable region -- confirm.
        while(start+i*2+2<=lastconst&&start+i*2+2<firstbt&&start+i*2+1024<writelimit&&i<MAXBLOCK-1) {
          i++;
          // A data word reads and writes no registers
          rs1[i]=-1;
          rs2[i]=-1;
          rs3[i]=-1;
          rt1[i]=-1;
          rt2[i]=-1;
          lt1[i]=-1;
          // Tag the slot as DATA; the "back up and redecode" check earlier in
          // this loop tests for this type when a branch lands on such a slot
          itype[i]=DATA;
          // Not a branch target and no branch address of its own
          bt[i]=0;ba[i]=-1;
          // Clear the remaining per-instruction flags and cost
          ooo[i]=0;cycles[i]=0;is_ds[i]=0;
        }
6070         // Does the block continue due to a branch?
6071         if(firstbt==start+i*2) done=j=0; // Branch into delay slot
6072         if(firstbt==start+i*2+2) done=j=0;
6073         if(firstbt==start+i*2+4) done=j=0; // CHECK: Is this useful?
6074       }
6075       else {
6076         if(stop_after_jal) done=1;
6077         // Stop on BREAK
6078         //if((source[i+1]&0xfc00003f)==0x0d) done=1;
6079       }
6080       // Don't recompile stuff that's already compiled
6081       if(check_addr(start+i*2+2+slave)) done=1;
6082       // Don't get too close to the limit
6083       if(i>MAXBLOCK/2) done=1;
6084     }
6085     if(yabsys.emulatebios) {
6086       if(start+i*2>=0x200&&start+i*2<0x600) {
6087         strcpy(insn[i],"(BIOS)");
6088         itype[i]=BIOS;
6089         done=1;
6090       }
6091     }
6092     //if(i>0&&itype[i-1]==SYSCALL&&stop_after_jal) done=1;
6093     //if(i>0&&itype[i-1]==SYSTEM&&source[i-1]==0x002B) done=1; // RTE
6094     //assert(i<MAXBLOCK-1);
6095     if(start+i*2==pagelimit-2) done=1;
6096     assert(start+i*2<pagelimit);
6097     if (i==MAXBLOCK-1) done=1;
6098     // Stop if we're compiling junk
6099     if(itype[i]==NI&&opcode[i]==0x11) {
6100       done=stop_after_jal=1;
6101       printf("Disabled speculative precompilation\n");
6102     }
6103     if(!done&&i<MAXBLOCK-1) {
6104       // Constant propagation
6105       //if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP)) isconst[i+1]=0;
6106       if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP)) p_isconst=0;
6107     }
6108   }
6109   slen=i;
6110   assert(slen>0);
6111 
6112   /* Pass 2 - Register dependencies and branch targets */
6113 
6114   // Flag branch targets
6115   for(i=0;i<slen;i++)
6116   {
6117     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
6118     {
6119       // If subroutine call, flag return address as a possible branch target
6120       if(rt1[i]==PR && i<slen-2) bt[i+2]=1;
6121 
6122       if(ba[i]>=start && ba[i]<(start+slen*2) ) {
6123         // Possibly internal branch, flag target
6124         bt[(ba[i]-start)>>1]=1;
6125       }
6126     }
6127   }
6128 
6129   // Do constant propagation
6130   p_isconst=0;
6131   for(i=0;i<slen;i++)
6132   {
6133     if(bt[i])
6134     {
6135       // Can't do constant propagation if a branch target intervenes
6136       p_isconst=0;
6137     }
6138     if(i>1&&(itype[i-2]==UJUMP||itype[i-2]==RJUMP)) p_isconst=0;
6139     if(i>0&&(itype[i-1]==UJUMP||itype[i-1]==RJUMP)) p_isconst=0;
6140     if(i>0&&(itype[i-1]==CJUMP||itype[i-1]==SJUMP)) p_isconst=0;
6141     do_consts(i,&p_isconst,p_constmap);
6142     if(itype[i]==RJUMP) {
6143       if(opcode[i]!=0||opcode2[i]!=11||opcode3[i]!=2) { // Not RTE
6144         if((p_isconst>>rs1[i])&1) {
6145           // Do constant propagation, branch to fixed address
6146           u32 constaddr=p_constmap[rs1[i]];
6147           if(opcode[i]==0&&opcode2[i]==3) {
6148             // PC-relative branch, add PC+4
6149             constaddr+=start+i*2+4;
6150           }
6151           ba[i]=constaddr;
6152           //if(internal_branch(constaddr))
6153           //  if(!bt[(constaddr-start)>>1]) printf("oops: %x\n",constaddr);
6154           //assert(bt[(constaddr-start)>>1]);
6155         }
6156       }
6157     }
6158     // No stack-based addressing modes in the delay slot,
6159     // to avoid incorrect constants due to pre-incrementing.
6160     // TODO: This really should only drop the address register
6161     if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==SJUMP) {
6162       if((source[i+1]&0xF00A)==0x4002) p_isconst=0;
6163       if((source[i+1]&0xB00E)==0x2004) p_isconst=0;
6164       if((source[i+1]&0xB00F)==0x2006) p_isconst=0;
6165     }
6166     memcpy(regs[i].constmap,p_constmap,sizeof(u32)*SH2_REGS);
6167     regs[i].isconst=p_isconst;
6168   }
6169   unneeded_registers(0,slen-1,0);
6170 
6171   /* Pass 3 - Register allocation */
6172 
6173   {
6174   struct regstat current; // Current register allocations/status
6175   int cc=0;
6176   current.dirty=0;
6177   current.u=unneeded_reg[0];
6178   clear_all_regs(current.regmap);
6179   alloc_reg(&current,0,CCREG);
6180   dirty_reg(&current,CCREG);
6181   current.isdoingcp=0;
6182   current.wasdoingcp=0;
6183 
6184   for(i=0;i<slen;i++)
6185   {
6186     if(bt[i])
6187     {
6188       // Can't do constant propagation if a branch target intervenes
6189       current.isdoingcp=0;
6190     }
6191     memcpy(regmap_pre[i],current.regmap,sizeof(current.regmap));
6192     //printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
6193     regs[i].wasdoingcp=current.isdoingcp;
6194     regs[i].wasdirty=current.dirty;
6195     if(itype[i]==UJUMP||itype[i]==SJUMP||itype[i]==RJUMP) {
6196       if(i+1<slen) {
6197         //current.u=branch_unneeded_reg[i]&~((1LL<<rs1[i+1])|(1LL<<rs2[i+1]));
6198         current.u=branch_unneeded_reg[i];
6199         //if(rt1[i+1]>=0) current.u|=1LL<<rt1[i+1];
6200         //if(rt2[i+1]>=0) current.u|=1LL<<rt2[i+1];
6201         if(rs1[i+1]>=0) current.u&=~(1LL<<rs1[i+1]);
6202         if(rs2[i+1]>=0) current.u&=~(1LL<<rs2[i+1]);
6203         if(rs3[i+1]>=0) current.u&=~(1LL<<rs3[i+1]);
6204         if(rs1[i+1]==TBIT||rs2[i+1]==TBIT) current.u&=~(1LL<<SR);
6205         if(rt1[i+1]==TBIT||rt2[i+1]==TBIT) current.u&=~(1LL<<SR);
6206         //current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6207         if(rs1[i]>=0) current.u&=~(1LL<<rs1[i]);
6208         if(rs2[i]>=0) current.u&=~(1LL<<rs2[i]); // CCREG
6209         if(rs1[i]==TBIT||rs2[i]==TBIT) current.u&=~(1LL<<SR); // BT/S BF/S
6210         regs[i].u=current.u;
6211       } else { printf("oops, branch at end of block with no delay slot\n");exit(1); }
6212     }else if(itype[i]==CJUMP) {
6213       current.u=branch_unneeded_reg[i];
6214       regs[i].u=current.u;
6215       if(rs1[i]>=0) current.u&=~(1LL<<rs1[i]);
6216       if(rs2[i]>=0) current.u&=~(1LL<<rs2[i]); // CCREG
6217       if(rs1[i]==TBIT||rs2[i]==TBIT) current.u&=~(1LL<<SR); // BT BF
6218     } else {
6219       if(i+1<slen) {
6220         regs[i].u=unneeded_reg[i+1];
6221         //current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
6222         //current.u=unneeded_reg[i+1]&~((1LL<<rs1[i])|(1LL<<rs2[i])|(1LL<<rs3[i]));
6223         current.u=unneeded_reg[i+1];
6224         if(rs1[i]>=0) current.u&=~(1LL<<rs1[i]);
6225         if(rs2[i]>=0) current.u&=~(1LL<<rs2[i]);
6226         if(rs3[i]>=0) current.u&=~(1LL<<rs3[i]);
6227         if(rs1[i]==TBIT||rs2[i]==TBIT) current.u&=~(1LL<<SR);
6228         if(rt1[i]==TBIT||rt2[i]==TBIT) current.u&=~(1LL<<SR);
6229       } else {
6230         current.u=0;
6231       }
6232     }
6233     is_ds[i]=ds;
6234     if(ds) {
6235       struct regstat temp;
6236       ds=0; // Skip delay slot, already allocated as part of branch
6237       // ...but we need to alloc it in case something jumps here
6238       if(i+1<slen) {
6239         current.u=branch_unneeded_reg[i-1]&unneeded_reg[i+1];
6240       }else{
6241         current.u=branch_unneeded_reg[i-1];
6242       }
6243       current.u&=~((1LL<<rs1[i])|(1LL<<rs2[i]));
6244       memcpy(&temp,&current,sizeof(current));
6245       temp.wasdirty=temp.dirty;
6246       // TODO: Take into account unconditional branches, as below
6247       delayslot_alloc(&temp,i);
6248       memcpy(regs[i].regmap,temp.regmap,sizeof(temp.regmap));
6249       regs[i].wasdirty=temp.wasdirty;
6250       regs[i].dirty=temp.dirty;
6251       regs[i].isdoingcp=0;
6252       regs[i].wasdoingcp=0;
6253       current.isdoingcp=0;
6254       // Create entry (branch target) regmap
6255       for(hr=0;hr<HOST_REGS;hr++)
6256       {
6257         int r=temp.regmap[hr];
6258         if(r>=0) {
6259           if(r!=regmap_pre[i][hr]) {
6260             regs[i].regmap_entry[hr]=-1;
6261           }
6262           else
6263           {
6264             if((current.u>>r)&1) {
6265               regs[i].regmap_entry[hr]=-1;
6266               regs[i].regmap[hr]=-1;
6267               //Don't clear regs in the delay slot as the branch might need them
6268               //current.regmap[hr]=-1;
6269             }else
6270               regs[i].regmap_entry[hr]=r;
6271           }
6272         } else {
6273           // First instruction expects CCREG to be allocated
6274           if(i==0&&hr==HOST_CCREG)
6275             regs[i].regmap_entry[hr]=CCREG;
6276           else
6277             regs[i].regmap_entry[hr]=-1;
6278         }
6279       }
6280     }
6281     else { // Not delay slot
6282       switch(itype[i]) {
6283         case UJUMP:
6284           //current.isdoingcp=0; // DEBUG
6285           //current.wasdoingcp=0; // DEBUG
6286           //regs[i].wasdoingcp=0; // DEBUG
6287           clear_const(&current,rt1[i]);
6288           alloc_cc(&current,i);
6289           dirty_reg(&current,CCREG);
6290           if (rt1[i]==PR) {
6291             alloc_reg(&current,i,PR);
6292             dirty_reg(&current,PR);
6293             assert(rs1[i+1]!=PR&&rs2[i+1]!=PR);
6294             #ifdef REG_PREFETCH
6295             alloc_reg(&current,i,PTEMP);
6296             #endif
6297           }
6298           ooo[i]=1;
6299           delayslot_alloc(&current,i+1);
6300           //current.isdoingcp=0; // DEBUG
6301           ds=1;
6302           //printf("i=%d, isdoingcp=%x\n",i,current.isdoingcp);
6303           break;
6304         case RJUMP:
6305           //current.isdoingcp=0;
6306           //current.wasdoingcp=0;
6307           //regs[i].wasdoingcp=0;
6308           clear_const(&current,rs1[i]);
6309           clear_const(&current,rt1[i]);
6310           alloc_cc(&current,i);
6311           dirty_reg(&current,CCREG);
6312           if(opcode[i]==0&&opcode2[i]==11&&opcode3[i]==2) { // RTE
6313             alloc_reg(&current,i,15); // Stack reg
6314             dirty_reg(&current,15);
6315             alloc_reg(&current,i,SR); // SR will be loaded from stack
6316             dirty_reg(&current,SR);
6317             assert(rt1[i+1]!=15&&rt2[i+1]!=15);
6318             assert(rt1[i+1]!=SR&&rt2[i+1]!=SR);
6319             assert(rt1[i+1]!=TBIT&&rt2[i+1]!=TBIT);
6320             delayslot_alloc(&current,i+1);
6321           }
6322           else
6323           if(rs1[i]!=rt1[i+1]&&rs1[i]!=rt2[i+1]) {
6324             alloc_reg(&current,i,rs1[i]);
6325             if (rt1[i]==PR) {
6326               alloc_reg(&current,i,rt1[i]);
6327               dirty_reg(&current,rt1[i]);
6328               assert(rs1[i+1]!=PR&&rs2[i+1]!=PR);
6329               if(rs1[i+1]==PR||rs2[i+1]==PR) {printf("OOPS\n");}
6330               #ifdef REG_PREFETCH
6331               alloc_reg(&current,i,PTEMP);
6332               #endif
6333             }
            #ifdef USE_MINI_HT
            // The decode pass only sets rs1 to PR for RTS (BSRF/JSR write PR
            // through rt1 instead), so this branch is the subroutine-return
            // case, matching the "RTS" block in the branch post-alloc code.
            // NOTE(review): RHASH/RHTBL are presumably the scratch registers
            // for the mini hash table lookup -- confirm against the assembler.
            if(rs1[i]==PR) { // RTS
              alloc_reg(&current,i,RHASH);
              #ifndef HOST_IMM_ADDR32
              alloc_reg(&current,i,RHTBL);
              #endif
            }
            #endif
6342             // PC-relative branch needs a temporary register to add PC
6343             if(opcode[i]==0&&opcode2[i]==3) alloc_reg(&current,i,RTEMP);
6344             delayslot_alloc(&current,i+1);
6345           } else {
6346             // The delay slot overwrites our source register,
6347             // allocate a temporary register to hold the old value.
6348             current.isdoingcp=0;
6349             current.wasdoingcp=0;
6350             regs[i].wasdoingcp=0;
6351             delayslot_alloc(&current,i+1);
6352             current.isdoingcp=0;
6353             alloc_reg(&current,i,RTEMP);
6354           }
6355           //current.isdoingcp=0; // DEBUG
6356           ooo[i]=1;
6357           ds=1;
6358           break;
6359         case CJUMP:
6360           //current.isdoingcp=0;
6361           //current.wasdoingcp=0;
6362           //regs[i].wasdoingcp=0;
6363           clear_const(&current,rs1[i]);
6364           clear_const(&current,rs2[i]);
6365           alloc_cc(&current,i);
6366           dirty_reg(&current,CCREG);
6367           alloc_reg(&current,i,SR);
6368           // No delay slot, don't do constant propagation
6369           current.isdoingcp=0;
6370           current.wasdoingcp=0;
6371           regs[i].wasdoingcp=0;
6372           //ds=1; // BT/BF don't have delay slots
6373           break;
6374         case SJUMP:
6375           //current.isdoingcp=0;
6376           //current.wasdoingcp=0;
6377           //regs[i].wasdoingcp=0;
6378           clear_const(&current,rs1[i]);
6379           clear_const(&current,rt1[i]);
6380           alloc_cc(&current,i);
6381           dirty_reg(&current,CCREG);
6382           alloc_reg(&current,i,SR);
6383           if(rt1[i+1]==TBIT||rt2[i+1]==TBIT||rt1[i+1]==SR||rt2[i+1]==SR) {
6384             // The delay slot overwrites the branch condition.
6385             // Allocate the branch condition registers instead.
6386             current.isdoingcp=0;
6387             current.wasdoingcp=0;
6388             regs[i].wasdoingcp=0;
6389           }
6390           else
6391           if(itype[i+1]==COMPLEX) {
6392             // The MAC and DIV instructions make function calls which
6393             // do not save registers.  Do the branch and update the
6394             // cycle count first.
6395             current.isdoingcp=0;
6396             current.wasdoingcp=0;
6397             regs[i].wasdoingcp=0;
6398           }
6399           else
6400           {
6401             ooo[i]=1;
6402             delayslot_alloc(&current,i+1);
6403           }
6404           ds=1;
6405           //current.isdoingcp=0;
6406           break;
6407         case IMM8:
6408           imm8_alloc(&current,i);
6409           break;
6410         case LOAD:
6411           load_alloc(&current,i);
6412           break;
6413         case STORE:
6414           store_alloc(&current,i);
6415           break;
6416         case RMW:
6417           rmw_alloc(&current,i);
6418           break;
6419         case PCREL:
6420           pcrel_alloc(&current,i);
6421           break;
6422         case ALU:
6423           alu_alloc(&current,i);
6424           break;
6425         case MULTDIV:
6426           multdiv_alloc(&current,i);
6427           break;
6428         case SHIFTIMM:
6429           shiftimm_alloc(&current,i);
6430           break;
6431         case MOV:
6432           mov_alloc(&current,i);
6433           break;
6434         case EXT:
6435           ext_alloc(&current,i);
6436           break;
6437         case FLAGS:
6438           flags_alloc(&current,i);
6439           break;
6440         case COMPLEX:
6441           complex_alloc(&current,i);
6442           break;
6443         case SYSTEM:
6444           system_alloc(&current,i);
6445           break;
6446       }
6447 
6448       //printf("xxx: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",current.regmap[0],current.regmap[1],current.regmap[2],current.regmap[3],current.regmap[5],current.regmap[6],current.regmap[7]);
6449 
6450       // Create entry (branch target) regmap
6451       for(hr=0;hr<HOST_REGS;hr++)
6452       {
6453         int r,or,er;
6454         r=current.regmap[hr];
6455         if(r>=0) {
6456           if(r!=regmap_pre[i][hr]) {
6457             // TODO: delay slot (?)
6458             or=get_reg(regmap_pre[i],r); // Get old mapping for this register
6459             if(or<0||(r&63)>=TEMPREG){
6460               regs[i].regmap_entry[hr]=-1;
6461             }
6462             else
6463             {
6464               // Just move it to a different register
6465               regs[i].regmap_entry[hr]=r;
6466               // If it was dirty before, it's still dirty
6467               if((regs[i].wasdirty>>or)&1) dirty_reg(&current,r&63);
6468             }
6469           }
6470           else
6471           {
6472             if(r<64){
6473               if((current.u>>r)&1) {
6474                 regs[i].regmap_entry[hr]=-1;
6475                 //regs[i].regmap[hr]=-1;
6476                 current.regmap[hr]=-1;
6477               }else
6478                 regs[i].regmap_entry[hr]=r;
6479             }
6480           }
6481         } else {
6482           // Branches expect CCREG to be allocated at the target
6483           if(regmap_pre[i][hr]==CCREG)
6484             regs[i].regmap_entry[hr]=CCREG;
6485           else
6486             regs[i].regmap_entry[hr]=-1;
6487         }
6488       }
6489       memcpy(regs[i].regmap,current.regmap,sizeof(current.regmap));
6490     }
6491     /* Branch post-alloc */
6492     if(i>0)
6493     {
6494       current.wasdirty=current.dirty;
6495       switch(itype[i-1]) {
6496         case UJUMP:
6497           memcpy(&branch_regs[i-1],&current,sizeof(current));
6498           branch_regs[i-1].isdoingcp=0;
6499           branch_regs[i-1].wasdoingcp=0;
6500           branch_regs[i-1].isconst=0;
6501           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
6502           alloc_cc(&branch_regs[i-1],i-1);
6503           dirty_reg(&branch_regs[i-1],CCREG);
6504           if(rt1[i-1]==PR) { // BSR
6505             alloc_reg(&branch_regs[i-1],i-1,PR);
6506             dirty_reg(&branch_regs[i-1],PR);
6507           }
6508           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
6509           memcpy(cpmap[i],cpmap[i-1],sizeof(current.cpmap));
6510           break;
6511         case RJUMP:
6512           memcpy(&branch_regs[i-1],&current,sizeof(current));
6513           branch_regs[i-1].isdoingcp=0;
6514           branch_regs[i-1].wasdoingcp=0;
6515           branch_regs[i-1].isconst=0;
6516           branch_regs[i-1].u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i-1])|(1LL<<rs2[i-1]));
6517           alloc_cc(&branch_regs[i-1],i-1);
6518           dirty_reg(&branch_regs[i-1],CCREG);
6519           alloc_reg(&branch_regs[i-1],i-1,rs1[i-1]);
6520           if(rt1[i-1]==PR) { // BSRF/JSR
6521             alloc_reg(&branch_regs[i-1],i-1,rt1[i-1]);
6522             dirty_reg(&branch_regs[i-1],rt1[i-1]);
6523           }
6524           #ifdef USE_MINI_HT
6525           if(rs1[i-1]==PR) { // RTS
6526             alloc_reg(&branch_regs[i-1],i-1,RHASH);
6527             #ifndef HOST_IMM_ADDR32
6528             alloc_reg(&branch_regs[i-1],i-1,RHTBL);
6529             #endif
6530           }
6531           #endif
6532           if(opcode[i-1]==0&&opcode2[i-1]==11&&opcode3[i-1]==2) { // RTE
6533             alloc_reg(&branch_regs[i-1],i-1,SR); // SR will be loaded from stack
6534             dirty_reg(&branch_regs[i-1],SR);
6535             alloc_reg(&branch_regs[i-1],i-1,RTEMP);
6536             alloc_reg(&branch_regs[i-1],i-1,MOREG);
6537           }
6538           memcpy(&branch_regs[i-1].regmap_entry,&branch_regs[i-1].regmap,sizeof(current.regmap));
6539           memcpy(cpmap[i],cpmap[i-1],sizeof(current.cpmap));
6540           break;
6541         case SJUMP:
6542           alloc_cc(&current,i-1);
6543           dirty_reg(&current,CCREG);
6544           if(rt1[i]==TBIT||rt2[i]==TBIT||rt1[i]==SR||rt2[i]==SR||itype[i]==COMPLEX) {
6545             // The delay slot overwrote the branch condition
6546             // Delay slot goes after the test (in order)
6547             current.u=branch_unneeded_reg[i-1]&~((1LL<<rs1[i])|(1LL<<rs2[i]));
6548             delayslot_alloc(&current,i);
6549             current.isdoingcp=0;
6550           }
6551           else
6552           {
6553             current.u=branch_unneeded_reg[i-1]&~(1LL<<rs1[i-1]);
6554             // Alloc the branch condition register
6555             alloc_reg(&current,i-1,SR);
6556           }
6557           memcpy(&branch_regs[i-1],&current,sizeof(current));
6558           branch_regs[i-1].isdoingcp=0;
6559           branch_regs[i-1].wasdoingcp=0;
6560           branch_regs[i-1].isconst=0;
6561           memcpy(&branch_regs[i-1].regmap_entry,&current.regmap,sizeof(current.regmap));
6562           memcpy(cpmap[i],cpmap[i-1],sizeof(current.cpmap));
6563           break;
6564       }
6565 
6566       if(itype[i-1]==UJUMP||itype[i-1]==RJUMP||itype[i]==DATA)
6567       {
6568         if(rt1[i-1]==PR&&itype[i]!=DATA) // BSR/JSR
6569         {
6570           // Subroutine call will return here, don't alloc any registers
6571           current.dirty=0;
6572           clear_all_regs(current.regmap);
6573           alloc_reg(&current,i,CCREG);
6574           dirty_reg(&current,CCREG);
6575         }
6576         else if(i+1<slen)
6577         {
6578           // Internal branch will jump here, match registers to caller
6579           current.dirty=0;
6580           clear_all_regs(current.regmap);
6581           alloc_reg(&current,i,CCREG);
6582           dirty_reg(&current,CCREG);
6583           for(j=i-1;j>=0;j--)
6584           {
6585             if(ba[j]==start+i*2+2) {
6586               if(itype[j]==CJUMP) {
6587                 memcpy(current.regmap,regs[j].regmap,sizeof(current.regmap));
6588                 current.dirty=regs[j].dirty;
6589               }else{
6590                 memcpy(current.regmap,branch_regs[j].regmap,sizeof(current.regmap));
6591                 current.dirty=branch_regs[j].dirty;
6592               }
6593               break;
6594             }
6595           }
6596           while(j>=0) {
6597             if(ba[j]==start+i*2+2) {
6598               for(hr=0;hr<HOST_REGS;hr++) {
6599                 if(itype[j]==CJUMP) {
6600                   if(current.regmap[hr]!=regs[j].regmap[hr]) {
6601                     current.regmap[hr]=-1;
6602                   }
6603                   current.dirty&=regs[j].dirty;
6604                 }else{
6605                   if(current.regmap[hr]!=branch_regs[j].regmap[hr]) {
6606                     current.regmap[hr]=-1;
6607                   }
6608                   current.dirty&=branch_regs[j].dirty;
6609                 }
6610               }
6611             }
6612             j--;
6613           }
6614         }
6615       }
6616     }
6617 
6618     // Count cycles in between branches
6619     ccadj[i]=cc;
6620     if(i>0&&(itype[i-1]==RJUMP||itype[i-1]==UJUMP))
6621     {
6622       cc=0;
6623     }
6624     else
6625     if(itype[i]==CJUMP||itype[i]==SJUMP)
6626     {
6627       cc=1;
6628     }
6629     else
6630     {
6631       cc+=cycles[i];
6632     }
6633 
6634     if(!is_ds[i]) {
6635       regs[i].dirty=current.dirty;
6636       regs[i].isdoingcp=current.isdoingcp;
6637       memcpy(cpmap[i],current.cpmap,sizeof(current.cpmap));
6638     }
6639     for(hr=0;hr<HOST_REGS;hr++) {
6640       if(hr!=EXCLUDE_REG&&regs[i].regmap[hr]>=0) {
6641         if(regmap_pre[i][hr]!=regs[i].regmap[hr]) {
6642           regs[i].wasdoingcp&=~(1<<hr);
6643         }
6644       }
6645     }
6646   }
6647   }
6648 
6649   /* Pass 4 - Cull unused host registers */
6650 
6651   {
6652   u64 nr=0;
6653 
6654   for (i=slen-1;i>=0;i--)
6655   {
6656     int hr;
6657     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP)
6658     {
6659       if(ba[i]<start || ba[i]>=(start+slen*2))
6660       {
6661         // Branch out of this block, don't need anything
6662         nr=0;
6663       }
6664       else
6665       {
6666         // Internal branch
6667         // Need whatever matches the target
6668         int t=(ba[i]-start)>>1;
6669         nr=0;
6670         for(hr=0;hr<HOST_REGS;hr++)
6671         {
6672           if(regs[i].regmap_entry[hr]>=0) {
6673             if(regs[i].regmap_entry[hr]==regs[t].regmap_entry[hr]) nr|=1<<hr;
6674           }
6675         }
6676       }
6677       // Conditional branch may need registers for following instructions
6678       if(itype[i]==SJUMP)
6679       {
6680         if(i<slen-2) {
6681           nr|=needed_reg[i+2];
6682           for(hr=0;hr<HOST_REGS;hr++)
6683           {
6684             if(regmap_pre[i+2][hr]>=0&&get_reg(regs[i+2].regmap_entry,regmap_pre[i+2][hr])<0) nr&=~(1<<hr);
6685             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*2,hr,regmap_entry[i+2][hr]);
6686           }
6687         }
6688       }
6689       else if(itype[i]==CJUMP)
6690       {
6691         if(i<slen-2) {
6692           nr|=needed_reg[i+1];
6693           for(hr=0;hr<HOST_REGS;hr++)
6694           {
6695             if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
6696             //if((regmap_entry[i+2][hr])>=0) if(!((nr>>hr)&1)) printf("%x-bogus(%d=%d)\n",start+i*2,hr,regmap_entry[i+2][hr]);
6697           }
6698         }
6699       }
6700       // Don't need stuff which is overwritten
6701       for(hr=0;hr<HOST_REGS;hr++) {
6702         if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
6703         if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
6704       }
6705       // Merge in delay slot
6706       if(itype[i]!=CJUMP)
6707       for(hr=0;hr<HOST_REGS;hr++)
6708       {
6709         // These are overwritten by the delay slot
6710         if(rt1[i+1]>=0&&rt1[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
6711         if(rt2[i+1]>=0&&rt2[i+1]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
6712         if(rs1[i+1]>=0&&rs1[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
6713         if(rs2[i+1]>=0&&rs2[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
6714         if(rs3[i+1]>=0&&rs3[i+1]==regmap_pre[i][hr]) nr|=1<<hr;
6715         if(rs1[i+1]>=0&&rs1[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
6716         if(rs2[i+1]>=0&&rs2[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
6717         if(rs3[i+1]>=0&&rs3[i+1]==regs[i].regmap_entry[hr]) nr|=1<<hr;
6718         //if(dep1[i+1]&&!((unneeded_reg_upper[i]>>dep1[i+1])&1)) {
6719         //  if(dep1[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
6720         //  if(dep2[i+1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
6721         //}
6722         //if(dep2[i+1]&&!((unneeded_reg_upper[i]>>dep2[i+1])&1)) {
6723         //  if(dep1[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
6724         //  if(dep2[i+1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
6725         //}
6726         if(regs[i].regmap_entry[hr]==SR) nr|=1<<hr;
6727         if(regs[i].regmap[hr]==SR) nr|=1<<hr;
6728         if(regmap_pre[i][hr]==SR) nr|=1<<hr;
6729       }
6730     }
6731     else if(itype[i]==SYSTEM)
6732     {
6733       // TRAPA instruction (software interrupt)
6734       nr=0;
6735       for(hr=0;hr<HOST_REGS;hr++)
6736       {
6737         // Source registers are needed
6738         if(regmap_pre[i][hr]==15) nr|=1<<hr;
6739         if(regmap_pre[i][hr]==SR) nr|=1<<hr;
6740         if(regmap_pre[i][hr]==VBR) nr|=1<<hr;
6741         if(regmap_pre[i][hr]==CCREG) nr|=1<<hr;
6742         if(regs[i].regmap_entry[hr]==15) nr|=1<<hr;
6743         if(regs[i].regmap_entry[hr]==SR) nr|=1<<hr;
6744         if(regs[i].regmap_entry[hr]==VBR) nr|=1<<hr;
6745         if(regs[i].regmap_entry[hr]==CCREG) nr|=1<<hr;
6746       }
6747     }
6748     else // Non-branch
6749     {
6750       if(i<slen-1) {
6751         for(hr=0;hr<HOST_REGS;hr++) {
6752           if(regmap_pre[i+1][hr]>=0&&get_reg(regs[i+1].regmap_entry,regmap_pre[i+1][hr])<0) nr&=~(1<<hr);
6753           if(regs[i].regmap[hr]!=regmap_pre[i+1][hr]) nr&=~(1<<hr);
6754           if(regs[i].regmap[hr]!=regmap_pre[i][hr]) nr&=~(1<<hr);
6755           if(regs[i].regmap[hr]<0) nr&=~(1<<hr);
6756         }
6757       }
6758     }
6759     for(hr=0;hr<HOST_REGS;hr++)
6760     {
6761       // Overwritten registers are not needed
6762       if(rt1[i]>=0&&rt1[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
6763       if(rt2[i]>=0&&rt2[i]==(regs[i].regmap[hr]&63)) nr&=~(1<<hr);
6764       // Source registers are needed
6765       if(rs1[i]>=0&&rs1[i]==regmap_pre[i][hr]) nr|=1<<hr;
6766       if(rs2[i]>=0&&rs2[i]==regmap_pre[i][hr]) nr|=1<<hr;
6767       if(rs3[i]>=0&&rs3[i]==regmap_pre[i][hr]) nr|=1<<hr;
6768       if(rs1[i]>=0&&rs1[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
6769       if(rs2[i]>=0&&rs2[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
6770       if(rs3[i]>=0&&rs3[i]==regs[i].regmap_entry[hr]) nr|=1<<hr;
6771       //if(dep1[i]&&!((unneeded_reg_upper[i]>>dep1[i])&1)) {
6772       //  if(dep1[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
6773       //  if(dep1[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
6774       //}
6775       //if(dep2[i]&&!((unneeded_reg_upper[i]>>dep2[i])&1)) {
6776       //  if(dep2[i]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
6777       //  if(dep2[i]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
6778       //}
6779       if(regs[i].regmap_entry[hr]==SR) nr|=1<<hr;
6780       if(regs[i].regmap[hr]==SR) nr|=1<<hr;
6781       if(regmap_pre[i][hr]==SR) nr|=1<<hr;
6782       // Don't store a register immediately after writing it,
6783       // may prevent dual-issue.
6784       // But do so if this is a branch target, otherwise we
6785       // might have to load the register before the branch.
6786       if(i>0&&!bt[i]&&((regs[i].wasdirty>>hr)&1)) {
6787         if(regmap_pre[i][hr]>=0&&!((unneeded_reg[i]>>regmap_pre[i][hr])&1)) {
6788           if(rt1[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
6789           if(rt2[i-1]==(regmap_pre[i][hr]&63)) nr|=1<<hr;
6790         }
6791         if(regs[i].regmap_entry[hr]>=0&&!((unneeded_reg[i]>>regs[i].regmap_entry[hr])&1)) {
6792           if(rt1[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
6793           if(rt2[i-1]==(regs[i].regmap_entry[hr]&63)) nr|=1<<hr;
6794         }
6795       }
6796     }
6797     // Cycle count is needed at branches.  Assume it is needed at the target too.
6798     if(i==0||bt[i]||itype[i]==CJUMP||itype[i]==SJUMP) {
6799       if(regmap_pre[i][HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
6800       if(regs[i].regmap_entry[HOST_CCREG]==CCREG) nr|=1<<HOST_CCREG;
6801     }
6802     // Save it
6803     needed_reg[i]=nr;
6804 
6805     // Deallocate unneeded registers
6806     for(hr=0;hr<HOST_REGS;hr++)
6807     {
6808       if(!((nr>>hr)&1)) {
6809         if(regs[i].regmap_entry[hr]!=CCREG) regs[i].regmap_entry[hr]=-1;
6810         if((regs[i].regmap[hr]&63)!=rs1[i] && (regs[i].regmap[hr]&63)!=rs2[i] &&
6811            (regs[i].regmap[hr]&63)!=rt1[i] && (regs[i].regmap[hr]&63)!=rt2[i] &&
6812            (regs[i].regmap[hr]&63)!=PTEMP && (regs[i].regmap[hr]&63)!=CCREG)
6813         {
6814           if(itype[i]==CJUMP) {
6815             regs[i].regmap[hr]=-1;
6816             regs[i].isdoingcp&=~(1<<hr);
6817             if(i<slen-1) {
6818               regmap_pre[i+1][hr]=-1;
6819               regs[i+1].wasdoingcp&=~(1<<hr);
6820             }
6821           }
6822         }
6823         if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==SJUMP)
6824         {
6825           int temp1=-1,temp2=-1;
6826           //if(get_reg(regs[i].regmap,rt1[i+1]|64)>=0||get_reg(branch_regs[i].regmap,rt1[i+1]|64)>=0)
6827           //{
6828           //  d1=dep1[i+1];
6829           //  d2=dep2[i+1];
6830           //}
6831           if(itype[i+1]==LOAD || itype[i+1]==STORE ||
6832              itype[i+1]==RMW || itype[i+1]==PCREL ||
6833              itype[i+1]==SYSTEM || source[i]==0x002B /* RTE */ )
6834             temp1=MOREG;
6835           if(itype[i+1]==COMPLEX) {
6836             temp1=MACH;
6837             temp2=MACL;
6838           }
6839           if(regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] && regs[i].regmap[hr]!=rs3[i] &&
6840              regs[i].regmap[hr]!=rt1[i] && regs[i].regmap[hr]!=rt2[i] &&
6841              regs[i].regmap[hr]!=rs1[i+1] && regs[i].regmap[hr]!=rs2[i+1] && regs[i].regmap[hr]!=rs3[i+1] &&
6842              regs[i].regmap[hr]!=rt1[i+1] && regs[i].regmap[hr]!=rt2[i+1] &&
6843              regs[i].regmap[hr]!=RHASH && regs[i].regmap[hr]!=RHTBL &&
6844              regs[i].regmap[hr]!=RTEMP && regs[i].regmap[hr]!=PTEMP &&
6845              regs[i].regmap[hr]!=CCREG &&
6846              regs[i].regmap[hr]!=temp1 && regs[i].regmap[hr]!=temp2 )
6847           {
6848             regs[i].regmap[hr]=-1;
6849             regs[i].isdoingcp&=~(1<<hr);
6850             if(branch_regs[i].regmap[hr]!=rs1[i] && branch_regs[i].regmap[hr]!=rs2[i] && branch_regs[i].regmap[hr]!=rs3[i] &&
6851                branch_regs[i].regmap[hr]!=rt1[i] && branch_regs[i].regmap[hr]!=rt2[i] &&
6852                branch_regs[i].regmap[hr]!=rs1[i+1] && branch_regs[i].regmap[hr]!=rs2[i+1] && branch_regs[i].regmap[hr]!=rs3[i+1] &&
6853                branch_regs[i].regmap[hr]!=rt1[i+1] && branch_regs[i].regmap[hr]!=rt2[i+1] &&
6854                branch_regs[i].regmap[hr]!=RHASH && branch_regs[i].regmap[hr]!=RHTBL &&
6855                branch_regs[i].regmap[hr]!=RTEMP && branch_regs[i].regmap[hr]!=PTEMP &&
6856                branch_regs[i].regmap[hr]!=CCREG &&
6857                branch_regs[i].regmap[hr]!=temp1 && branch_regs[i].regmap[hr]!=temp2)
6858             {
6859               branch_regs[i].regmap[hr]=-1;
6860               branch_regs[i].regmap_entry[hr]=-1;
6861               if(itype[i]!=RJUMP&&itype[i]!=UJUMP)
6862               {
6863                 if(i<slen-2) {
6864                   regmap_pre[i+2][hr]=-1;
6865                   regs[i+2].wasdoingcp&=~(1<<hr);
6866                 }
6867               }
6868             }
6869           }
6870         }
6871         else
6872         {
6873           // Non-branch
6874           if(i>0)
6875           {
6876             int temp1=-1,temp2=-1;
6877             //if(get_reg(regs[i].regmap,rt1[i]|64)>=0)
6878             //{
6879             //  d1=dep1[i];
6880             //  d2=dep2[i];
6881             //}
6882             if(itype[i]==LOAD || itype[i]==STORE || itype[i]==RMW ||
6883                itype[i]==PCREL || itype[i]==SYSTEM )
6884               temp1=MOREG;
6885             if(itype[i]==COMPLEX) {
6886               temp1=MACH;
6887               temp2=MACL;
6888             }
6889             else if(itype[i]==SYSTEM) {
6890               temp2=CCREG;
6891             }
6892             if(regs[i].regmap[hr]!=rt1[i] && regs[i].regmap[hr]!=rt2[i] &&
6893                regs[i].regmap[hr]!=rs1[i] && regs[i].regmap[hr]!=rs2[i] &&
6894                regs[i].regmap[hr]!=rs3[i] &&
6895                regs[i].regmap[hr]!=temp1 && regs[i].regmap[hr]!=temp2 &&
6896                regs[i].regmap[hr]!=CCREG)
6897             {
6898               if(i<slen-1&&!is_ds[i]) {
6899                 if(regmap_pre[i+1][hr]!=-1 || regs[i].regmap[hr]!=-1)
6900                 if(regmap_pre[i+1][hr]!=regs[i].regmap[hr])
6901                 {
6902                   printf("fail: %x (%d %d!=%d)\n",start+i*2,hr,regmap_pre[i+1][hr],regs[i].regmap[hr]);
6903                   assert(regmap_pre[i+1][hr]==regs[i].regmap[hr]);
6904                 }
6905                 regmap_pre[i+1][hr]=-1;
6906                 if(regs[i+1].regmap_entry[hr]==CCREG) regs[i+1].regmap_entry[hr]=-1;
6907                 regs[i+1].wasdoingcp&=~(1<<hr);
6908               }
6909               regs[i].regmap[hr]=-1;
6910               regs[i].isdoingcp&=~(1<<hr);
6911             }
6912           }
6913         }
6914       }
6915     }
6916   }
6917   }
6918 
6919   /* Pass 5 - Pre-allocate registers */
6920 
6921   // If a register is allocated during a loop, try to allocate it for the
6922   // entire loop, if possible.  This avoids loading/storing registers
6923   // inside of the loop.
6924   {
6925   signed char f_regmap[HOST_REGS];
6926   clear_all_regs(f_regmap);
6927   for(i=0;i<slen-1;i++)
6928   {
6929     if(itype[i]==UJUMP||itype[i]==SJUMP||itype[i]==CJUMP)
6930     {
6931       if(ba[i]>=start && ba[i]<(start+i*2))
6932       if(itype[i]==CJUMP||itype[i+1]==NOP||itype[i+1]==MOV||itype[i+1]==ALU
6933       ||itype[i+1]==SHIFTIMM||itype[i+1]==IMM8||itype[i+1]==LOAD
6934       ||itype[i+1]==STORE||itype[i+1]==RMW||itype[i+1]==PCREL||itype[i+1]==EXT||itype[i+1]==FLAGS)
6935       {
6936         // Track register allocation
6937         int t=(ba[i]-start)>>1;
6938         if(t>0&&(itype[t-1]!=UJUMP&&itype[t-1]!=RJUMP&&itype[t-1]!=SJUMP)) // loop_preload can't handle jumps into delay slots
6939         if(t<2||(itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||rt1[t-2]!=PR) // call/ret assumes no registers allocated
6940         for(hr=0;hr<HOST_REGS;hr++)
6941         {
6942           if(regs[i].regmap[hr]>=0) {
6943             if(f_regmap[hr]!=regs[i].regmap[hr]) {
6944               // dealloc old register
6945               int n;
6946               for(n=0;n<HOST_REGS;n++)
6947               {
6948                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
6949               }
6950               // and alloc new one
6951               f_regmap[hr]=regs[i].regmap[hr];
6952             }
6953           }
6954           if(branch_regs[i].regmap[hr]>=0) {
6955             if(f_regmap[hr]!=branch_regs[i].regmap[hr]) {
6956               // dealloc old register
6957               int n;
6958               for(n=0;n<HOST_REGS;n++)
6959               {
6960                 if(f_regmap[n]==branch_regs[i].regmap[hr]) {f_regmap[n]=-1;}
6961               }
6962               // and alloc new one
6963               f_regmap[hr]=branch_regs[i].regmap[hr];
6964             }
6965           }
6966           if(ooo[i]) {
6967             if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1])
6968               f_regmap[hr]=branch_regs[i].regmap[hr];
6969           }else{
6970             if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1])
6971               f_regmap[hr]=branch_regs[i].regmap[hr];
6972           }
6973           // Avoid dirty->clean transition
6974           //if(t>0) if(get_reg(regmap_pre[t],f_regmap[hr])>=0) if((regs[t].wasdirty>>get_reg(regmap_pre[t],f_regmap[hr]))&1) f_regmap[hr]=-1;
6975           // This check isn't required, but it's a good idea.  We can't hoist
6976           // the load if the register was already allocated, so there's no
6977           // point wasting time analyzing most of these cases.  It only
6978           // "succeeds" when the mapping was different and the load can be
6979           // replaced with a mov, which is of negligible benefit.  So such
6980           // cases are skipped below.
6981           if(f_regmap[hr]>=0) {
6982             if(regs[t].regmap[hr]==f_regmap[hr]||(regs[t].regmap_entry[hr]<0&&get_reg(regmap_pre[t],f_regmap[hr])<0)) {
6983               int r=f_regmap[hr];
6984               for(j=t;j<=i;j++)
6985               {
6986                 //printf("Test %x -> %x, %x %d/%d\n",start+i*2,ba[i],start+j*2,hr,r);
6987                 if(r<TBIT&&((unneeded_reg[j]>>r)&1)) break;
6988                 if(regs[j].regmap[hr]==f_regmap[hr]&&(f_regmap[hr]&63)<TEMPREG) {
6989                   //printf("Hit %x -> %x, %x %d/%d\n",start+i*2,ba[i],start+j*2,hr,r);
6990                   int k;
6991                   if(regs[i].regmap[hr]==-1&&branch_regs[i].regmap[hr]==-1) {
6992                     if(get_reg(regs[i+2].regmap,f_regmap[hr])>=0) break;
6993                     k=i;
6994                     while(k>1&&regs[k-1].regmap[hr]==-1) {
6995                       if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
6996                         //printf("no free regs for store %x\n",start+(k-1)*4);
6997                         break;
6998                       }
6999                       if(get_reg(regs[k-1].regmap,f_regmap[hr])>=0) {
7000                         //printf("no-match due to different register\n");
7001                         break;
7002                       }
7003                       if(itype[k-2]==UJUMP||itype[k-2]==RJUMP||itype[k-2]==CJUMP||itype[k-2]==SJUMP) {
7004                         //printf("no-match due to branch\n");
7005                         break;
7006                       }
7007                       // call/ret fast path assumes no registers allocated
7008                       if(k>2&&(itype[k-3]==UJUMP||itype[k-3]==RJUMP)&&rt1[k-3]==PR) {
7009                         break;
7010                       }
7011                       k--;
7012                     }
7013                     if(regs[k-1].regmap[hr]==f_regmap[hr]&&regmap_pre[k][hr]==f_regmap[hr]) {
7014                       //printf("Extend r%d, %x ->\n",hr,start+k*4);
7015                       while(k<i) {
7016                         regs[k].regmap_entry[hr]=f_regmap[hr];
7017                         regs[k].regmap[hr]=f_regmap[hr];
7018                         regmap_pre[k+1][hr]=f_regmap[hr];
7019                         regs[k].wasdirty&=~(1<<hr);
7020                         regs[k].dirty&=~(1<<hr);
7021                         regs[k].wasdirty|=(1<<hr)&regs[k-1].dirty;
7022                         regs[k].dirty|=(1<<hr)&regs[k].wasdirty;
7023                         regs[k].wasdoingcp&=~(1<<hr);
7024                         regs[k].isdoingcp&=~(1<<hr);
7025                         k++;
7026                       }
7027                     }
7028                     else {
7029                       //printf("Fail Extend r%d, %x ->\n",hr,start+k*4);
7030                       break;
7031                     }
7032                     assert(regs[i-1].regmap[hr]==f_regmap[hr]);
7033                     if(regs[i-1].regmap[hr]==f_regmap[hr]&&regmap_pre[i][hr]==f_regmap[hr]) {
7034                       //printf("OK fill %x (r%d)\n",start+i*4,hr);
7035                       regs[i].regmap_entry[hr]=f_regmap[hr];
7036                       regs[i].regmap[hr]=f_regmap[hr];
7037                       regs[i].wasdirty&=~(1<<hr);
7038                       regs[i].dirty&=~(1<<hr);
7039                       regs[i].wasdirty|=(1<<hr)&regs[i-1].dirty;
7040                       regs[i].dirty|=(1<<hr)&regs[i-1].dirty;
7041                       regs[i].wasdoingcp&=~(1<<hr);
7042                       regs[i].isdoingcp&=~(1<<hr);
7043                       branch_regs[i].regmap_entry[hr]=f_regmap[hr];
7044                       branch_regs[i].wasdirty&=~(1<<hr);
7045                       branch_regs[i].wasdirty|=(1<<hr)&regs[i].dirty;
7046                       branch_regs[i].regmap[hr]=f_regmap[hr];
7047                       branch_regs[i].dirty&=~(1<<hr);
7048                       branch_regs[i].dirty|=(1<<hr)&regs[i].dirty;
7049                       branch_regs[i].wasdoingcp&=~(1<<hr);
7050                       branch_regs[i].isdoingcp&=~(1<<hr);
7051                       if(itype[i]==CJUMP) {
7052                         regmap_pre[i+1][hr]=f_regmap[hr];
7053                         regs[i+1].wasdirty&=~(1<<hr);
7054                         regs[i+1].wasdirty|=(1<<hr)&regs[i].dirty;
7055                       }
7056                       else if(itype[i]!=RJUMP&&itype[i]!=UJUMP) {
7057                         regmap_pre[i+2][hr]=f_regmap[hr];
7058                         regs[i+2].wasdirty&=~(1<<hr);
7059                         regs[i+2].wasdirty|=(1<<hr)&regs[i].dirty;
7060                       }
7061                     }
7062                   }
7063                   for(k=t;k<j;k++) {
7064                     // Alloc register clean at beginning of loop,
7065                     // but may dirty it in pass 6
7066                     regs[k].regmap_entry[hr]=f_regmap[hr];
7067                     regs[k].regmap[hr]=f_regmap[hr];
7068                     regs[k].dirty&=~(1<<hr);
7069                     regs[k].wasdoingcp&=~(1<<hr);
7070                     regs[k].isdoingcp&=~(1<<hr);
7071                     if(itype[k]==UJUMP||itype[k]==RJUMP||itype[k]==SJUMP) {
7072                       branch_regs[k].regmap_entry[hr]=f_regmap[hr];
7073                       branch_regs[k].regmap[hr]=f_regmap[hr];
7074                       branch_regs[k].dirty&=~(1<<hr);
7075                       branch_regs[k].wasdoingcp&=~(1<<hr);
7076                       branch_regs[k].isdoingcp&=~(1<<hr);
7077                       if(itype[k]!=RJUMP&&itype[k]!=UJUMP) {
7078                         regmap_pre[k+2][hr]=f_regmap[hr];
7079                         regs[k+2].wasdirty&=~(1<<hr);
7080                       }
7081                     }
7082                     else
7083                     {
7084                       regmap_pre[k+1][hr]=f_regmap[hr];
7085                       regs[k+1].wasdirty&=~(1<<hr);
7086                     }
7087                   }
7088                   if(regs[j].regmap[hr]==f_regmap[hr])
7089                     regs[j].regmap_entry[hr]=f_regmap[hr];
7090                   break;
7091                 }
7092                 if(j==i) break;
7093                 if(regs[j].regmap[hr]>=0)
7094                   break;
7095                 if(get_reg(regs[j].regmap,f_regmap[hr])>=0) {
7096                   //printf("no-match due to different register\n");
7097                   break;
7098                 }
7099                 if(itype[j]==UJUMP||itype[j]==RJUMP)
7100                 {
7101                   // Stop on unconditional branch
7102                   break;
7103                 }
7104                 if(itype[j]==SJUMP)
7105                 {
7106                   if(ooo[j]) {
7107                     if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1])
7108                       break;
7109                   }else{
7110                     if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1])
7111                       break;
7112                   }
7113                   if(get_reg(branch_regs[j].regmap,f_regmap[hr])>=0) {
7114                     //printf("no-match due to different register (branch)\n");
7115                     break;
7116                   }
7117                 }
7118                 if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
7119                   //printf("No free regs for store %x\n",start+j*4);
7120                   break;
7121                 }
7122               }
7123             }
7124           }
7125         }
7126       }
7127     }else{
7128       // Non branch or undetermined branch target
7129       for(hr=0;hr<HOST_REGS;hr++)
7130       {
7131         if(hr!=EXCLUDE_REG) {
7132           if(regs[i].regmap[hr]>=0) {
7133             if(f_regmap[hr]!=regs[i].regmap[hr]) {
7134               // dealloc old register
7135               int n;
7136               for(n=0;n<HOST_REGS;n++)
7137               {
7138                 if(f_regmap[n]==regs[i].regmap[hr]) {f_regmap[n]=-1;}
7139               }
7140               // and alloc new one
7141               f_regmap[hr]=regs[i].regmap[hr];
7142             }
7143           }
7144         }
7145       }
7146       // Try to restore cycle count at branch targets
7147       if(bt[i]) {
7148         for(j=i;j<slen-1;j++) {
7149           if(regs[j].regmap[HOST_CCREG]!=-1) break;
7150           if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) {
7151             //printf("no free regs for store %x\n",start+j*4);
7152             break;
7153           }
7154         }
7155         if(regs[j].regmap[HOST_CCREG]==CCREG) {
7156           int k=i;
7157           //printf("Extend CC, %x -> %x\n",start+k*4,start+j*4);
7158           while(k<j) {
7159             regs[k].regmap_entry[HOST_CCREG]=CCREG;
7160             regs[k].regmap[HOST_CCREG]=CCREG;
7161             regmap_pre[k+1][HOST_CCREG]=CCREG;
7162             regs[k+1].wasdirty|=1<<HOST_CCREG;
7163             regs[k].dirty|=1<<HOST_CCREG;
7164             regs[k].wasdoingcp&=~(1<<HOST_CCREG);
7165             regs[k].isdoingcp&=~(1<<HOST_CCREG);
7166             k++;
7167           }
7168           regs[j].regmap_entry[HOST_CCREG]=CCREG;
7169         }
7170         // Work backwards from the branch target
7171         if(j>i&&f_regmap[HOST_CCREG]==CCREG)
7172         {
7173           //printf("Extend backwards\n");
7174           int k;
7175           k=i;
7176           while(regs[k-1].regmap[HOST_CCREG]==-1) {
7177             if(count_free_regs(regs[k-1].regmap)<=minimum_free_regs[k-1]) {
7178               //printf("no free regs for store %x\n",start+(k-1)*4);
7179               break;
7180             }
7181             k--;
7182           }
7183           if(regs[k-1].regmap[HOST_CCREG]==CCREG) {
7184             //printf("Extend CC, %x ->\n",start+k*4);
7185             while(k<=i) {
7186               regs[k].regmap_entry[HOST_CCREG]=CCREG;
7187               regs[k].regmap[HOST_CCREG]=CCREG;
7188               regmap_pre[k+1][HOST_CCREG]=CCREG;
7189               regs[k+1].wasdirty|=1<<HOST_CCREG;
7190               regs[k].dirty|=1<<HOST_CCREG;
7191               regs[k].wasdoingcp&=~(1<<HOST_CCREG);
7192               regs[k].isdoingcp&=~(1<<HOST_CCREG);
7193               k++;
7194             }
7195           }
7196           else {
7197             //printf("Fail Extend CC, %x ->\n",start+k*4);
7198           }
7199         }
7200       }
7201       // Don't try to add registers to complex instructions like MAC, division, etc.
7202       if(itype[i]!=STORE&&itype[i]!=RMW&&itype[i]!=PCREL&&
7203          itype[i]!=NOP&&itype[i]!=MOV&&itype[i]!=ALU&&itype[i]!=SHIFTIMM&&
7204          itype[i]!=IMM8&&itype[i]!=LOAD&&itype[i]!=EXT&&itype[i]!=FLAGS)
7205       {
7206         memcpy(f_regmap,regs[i].regmap,sizeof(f_regmap));
7207       }
7208     }
7209   }
7210 
7211   // Cache memory_map pointer if a register is available
7212   #ifndef HOST_IMM_ADDR32
7213   {
7214     int earliest_available[HOST_REGS];
7215     int loop_start[HOST_REGS];
7216     int score[HOST_REGS];
7217     int end[HOST_REGS];
7218     int reg=MMREG;
7219 
7220     // Init
7221     for(hr=0;hr<HOST_REGS;hr++) {
7222       score[hr]=0;earliest_available[hr]=0;
7223       loop_start[hr]=MAXBLOCK;
7224     }
7225     for(i=0;i<slen-1;i++)
7226     {
7227       // Can't do anything if no registers are available
7228       if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i]) {
7229         for(hr=0;hr<HOST_REGS;hr++) {
7230           score[hr]=0;earliest_available[hr]=i+1;
7231           loop_start[hr]=MAXBLOCK;
7232         }
7233       }
7234       if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==SJUMP) {
7235         if(!ooo[i]) {
7236           if(count_free_regs(branch_regs[i].regmap)<=minimum_free_regs[i+1]) {
7237             for(hr=0;hr<HOST_REGS;hr++) {
7238               score[hr]=0;earliest_available[hr]=i+1;
7239               loop_start[hr]=MAXBLOCK;
7240             }
7241           }
7242         }else{
7243           if(count_free_regs(regs[i].regmap)<=minimum_free_regs[i+1]) {
7244             for(hr=0;hr<HOST_REGS;hr++) {
7245               score[hr]=0;earliest_available[hr]=i+1;
7246               loop_start[hr]=MAXBLOCK;
7247             }
7248           }
7249         }
7250       }
7251       // Mark unavailable registers
7252       for(hr=0;hr<HOST_REGS;hr++) {
7253         if(regs[i].regmap[hr]>=0) {
7254           score[hr]=0;earliest_available[hr]=i+1;
7255           loop_start[hr]=MAXBLOCK;
7256         }
7257         if(itype[i]==UJUMP||itype[i]==RJUMP||itype[i]==SJUMP) {
7258           if(branch_regs[i].regmap[hr]>=0) {
7259             score[hr]=0;earliest_available[hr]=i+2;
7260             loop_start[hr]=MAXBLOCK;
7261           }
7262         }
7263       }
7264       // No register allocations after unconditional jumps
7265       if(itype[i]==UJUMP||itype[i]==RJUMP)
7266       {
7267         for(hr=0;hr<HOST_REGS;hr++) {
7268           score[hr]=0;earliest_available[hr]=i+2;
7269           loop_start[hr]=MAXBLOCK;
7270         }
7271         i++; // Skip delay slot too
7272         //printf("skip delay slot: %x\n",start+i*4);
7273       }
7274       else
7275       // Possible match
7276       if(itype[i]==LOAD||itype[i]==STORE||itype[i]==RMW) {
7277         for(hr=0;hr<HOST_REGS;hr++) {
7278           if(hr!=EXCLUDE_REG) {
7279             end[hr]=i-1;
7280             for(j=i;j<slen-1;j++) {
7281               if(regs[j].regmap[hr]>=0) break;
7282               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==SJUMP) {
7283                 if(branch_regs[j].regmap[hr]>=0) break;
7284                 if(ooo[j]) {
7285                   if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j+1]) break;
7286                 }else{
7287                   if(count_free_regs(branch_regs[j].regmap)<=minimum_free_regs[j+1]) break;
7288                 }
7289               }
7290               else if(count_free_regs(regs[j].regmap)<=minimum_free_regs[j]) break;
7291               if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP) {
7292                 int t=(ba[j]-start)>>1;
7293                 if(t<j&&t>=earliest_available[hr]) {
7294                   if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=PR)) { // call/ret assumes no registers allocated
7295                     // Score a point for hoisting loop invariant
7296                     if(t<loop_start[hr]) loop_start[hr]=t;
7297                     //printf("set loop_start: i=%x j=%x (%x)\n",start+i*2,start+j*2,start+t*2);
7298                     score[hr]++;
7299                     end[hr]=j;
7300                   }
7301                 }
7302                 else if(t<j) {
7303                   if(regs[t].regmap[hr]==reg) {
7304                     // Score a point if the branch target matches this register
7305                     score[hr]++;
7306                     end[hr]=j;
7307                   }
7308                 }
7309                 if(itype[j+1]==LOAD||itype[j+1]==STORE||itype[j+1]==RMW) {
7310                   score[hr]++;
7311                   end[hr]=j;
7312                 }
7313               }
7314               if(itype[j]==UJUMP||itype[j]==RJUMP)
7315               {
7316                 // Stop on unconditional branch
7317                 break;
7318               }
7319               else
7320               if(itype[j]==LOAD||itype[j]==STORE||itype[j]==RMW) {
7321                 score[hr]++;
7322                 end[hr]=j;
7323               }
7324             }
7325           }
7326         }
7327         // Find highest score and allocate that register
7328         int maxscore=0;
7329         for(hr=0;hr<HOST_REGS;hr++) {
7330           if(hr!=EXCLUDE_REG) {
7331             if(score[hr]>score[maxscore]) {
7332               maxscore=hr;
7333               //printf("highest score: %d %d (%x->%x)\n",score[hr],hr,start+i*2,start+end[hr]*2);
7334             }
7335           }
7336         }
7337         if(score[maxscore]>1)
7338         {
7339           if(i<loop_start[maxscore]) loop_start[maxscore]=i;
7340           for(j=loop_start[maxscore];j<slen&&j<=end[maxscore];j++) {
7341             //if(regs[j].regmap[maxscore]>=0) {printf("oops: %x %x was %d=%d\n",loop_start[maxscore]*2+start,j*2+start,maxscore,regs[j].regmap[maxscore]);}
7342             assert(regs[j].regmap[maxscore]<0);
7343             if(j>loop_start[maxscore]) regs[j].regmap_entry[maxscore]=reg;
7344             regs[j].regmap[maxscore]=reg;
7345             regs[j].dirty&=~(1<<maxscore);
7346             regs[j].wasdoingcp&=~(1<<maxscore);
7347             regs[j].isdoingcp&=~(1<<maxscore);
7348             if(itype[j]==UJUMP||itype[j]==RJUMP||itype[j]==CJUMP||itype[j]==SJUMP) {
7349               if(itype[j]!=CJUMP) {
7350                 branch_regs[j].regmap[maxscore]=reg;
7351                 branch_regs[j].wasdirty&=~(1<<maxscore);
7352                 branch_regs[j].dirty&=~(1<<maxscore);
7353                 branch_regs[j].wasdoingcp&=~(1<<maxscore);
7354                 branch_regs[j].isdoingcp&=~(1<<maxscore);
7355                 if(itype[j]==SJUMP) {
7356                   regmap_pre[j+2][maxscore]=reg;
7357                   regs[j+2].wasdirty&=~(1<<maxscore);
7358                 }
7359               }
7360               else { // if(itype[j]==CJUMP)
7361                 regmap_pre[j+1][maxscore]=reg;
7362                 regs[j+1].wasdirty&=~(1<<maxscore);
7363               }
7364               // loop optimization (loop_preload)
7365               int t=(ba[j]-start)>>1;
7366               if(t==loop_start[maxscore]) {
7367                 if(t==1||(t>1&&itype[t-2]!=UJUMP&&itype[t-2]!=RJUMP)||(t>1&&rt1[t-2]!=PR)) // call/ret assumes no registers allocated
7368                   regs[t].regmap_entry[maxscore]=reg;
7369               }
7370             }
7371             else
7372             {
7373               if(j<1||(itype[j-1]!=RJUMP&&itype[j-1]!=UJUMP&&itype[j-1]!=SJUMP)) {
7374                 regmap_pre[j+1][maxscore]=reg;
7375                 regs[j+1].wasdirty&=~(1<<maxscore);
7376               }
7377             }
7378           }
7379           i=j-1;
7380           if(itype[j-1]==RJUMP||itype[j-1]==UJUMP||itype[j-1]==SJUMP) i++; // skip delay slot
7381           for(hr=0;hr<HOST_REGS;hr++) {
7382             score[hr]=0;earliest_available[hr]=i+i;
7383             loop_start[hr]=MAXBLOCK;
7384           }
7385         }
7386       }
7387     }
7388   }
7389   #endif
7390 
7391   // This allocates registers (if possible) one instruction prior
7392   // to use, which can avoid a load-use penalty on certain CPUs.
7393   for(i=0;i<slen-1;i++)
7394   {
7395     if(!i||(itype[i-1]!=UJUMP&&itype[i-1]!=CJUMP&&itype[i-1]!=SJUMP&&itype[i-1]!=RJUMP))
7396     {
7397       if(!bt[i+1])
7398       {
7399         if(itype[i]==LOAD||itype[i]==PCREL||itype[i]==MOV||itype[i]==ALU||itype[i]==SHIFTIMM||itype[i]==IMM8||itype[i]==EXT||itype[i]==FLAGS)
7400         {
7401           if(rs1[i+1]>=0) {
7402             if((hr=get_reg(regs[i+1].regmap,rs1[i+1]==TBIT?SR:rs1[i+1]))>=0)
7403             {
7404               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0
7405                  &&count_free_regs(regs[i].regmap)>minimum_free_regs[i])
7406               {
7407                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
7408                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
7409                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
7410                 regs[i].isdoingcp&=~(1<<hr);
7411                 regs[i].isdoingcp|=regs[i+1].isdoingcp&(1<<hr);
7412                 cpmap[i][hr]=cpmap[i+1][hr];
7413                 regs[i+1].wasdirty&=~(1<<hr);
7414                 regs[i].dirty&=~(1<<hr);
7415               }
7416             }
7417           }
7418           if(rs2[i+1]>=0) {
7419             if((hr=get_reg(regs[i+1].regmap,rs2[i+1]==TBIT?SR:rs2[i+1]))>=0)
7420             {
7421               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0
7422                  &&count_free_regs(regs[i].regmap)>minimum_free_regs[i])
7423               {
7424                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
7425                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
7426                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
7427                 regs[i].isdoingcp&=~(1<<hr);
7428                 regs[i].isdoingcp|=regs[i+1].isdoingcp&(1<<hr);
7429                 cpmap[i][hr]=cpmap[i+1][hr];
7430                 regs[i+1].wasdirty&=~(1<<hr);
7431                 regs[i].dirty&=~(1<<hr);
7432               }
7433             }
7434           }
7435           if(rs3[i+1]>=0) {
7436             if((hr=get_reg(regs[i+1].regmap,rs3[i+1]==TBIT?SR:rs3[i+1]))>=0)
7437             {
7438               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0
7439                  &&count_free_regs(regs[i].regmap)>minimum_free_regs[i])
7440               {
7441                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
7442                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
7443                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
7444                 regs[i].isdoingcp&=~(1<<hr);
7445                 regs[i].isdoingcp|=regs[i+1].isdoingcp&(1<<hr);
7446                 cpmap[i][hr]=cpmap[i+1][hr];
7447                 regs[i+1].wasdirty&=~(1<<hr);
7448                 regs[i].dirty&=~(1<<hr);
7449               }
7450             }
7451           }
7452           if(rt1[i+1]==TBIT||rt2[i+1]==TBIT) {
7453             if(rt1[i+1]!=SR&&rt2[i+1]!=SR)
7454             if((hr=get_reg(regs[i+1].regmap,SR))>=0)
7455             {
7456               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0
7457                  &&count_free_regs(regs[i].regmap)>minimum_free_regs[i])
7458               {
7459                 regs[i].regmap[hr]=regs[i+1].regmap[hr];
7460                 regmap_pre[i+1][hr]=regs[i+1].regmap[hr];
7461                 regs[i+1].regmap_entry[hr]=regs[i+1].regmap[hr];
7462                 regs[i].isdoingcp&=~(1<<hr);
7463                 regs[i].isdoingcp|=regs[i+1].isdoingcp&(1<<hr);
7464                 cpmap[i][hr]=cpmap[i+1][hr];
7465                 regs[i+1].wasdirty&=~(1<<hr);
7466                 regs[i].dirty&=~(1<<hr);
7467               }
7468             }
7469           }
7470           // Preload target address for load instruction (non-constant)
7471           if(itype[i+1]==LOAD&&rs1[i+1]>=0&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
7472             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
7473             {
7474               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0
7475                  &&count_free_regs(regs[i].regmap)>minimum_free_regs[i])
7476               {
7477                 regs[i].regmap[hr]=rs1[i+1];
7478                 regmap_pre[i+1][hr]=rs1[i+1];
7479                 regs[i+1].regmap_entry[hr]=rs1[i+1];
7480                 regs[i].isdoingcp&=~(1<<hr);
7481                 regs[i].isdoingcp|=regs[i+1].isdoingcp&(1<<hr);
7482                 cpmap[i][hr]=cpmap[i+1][hr];
7483                 regs[i+1].wasdirty&=~(1<<hr);
7484                 regs[i].dirty&=~(1<<hr);
7485               }
7486             }
7487           }
7488           #if 0
7489           // Load source into target register (not implemented)
7490           if(lt1[i+1]>=0&&get_reg(regs[i+1].regmap,rs1[i+1])<0) {
7491             if((hr=get_reg(regs[i+1].regmap,rt1[i+1]))>=0)
7492             {
7493               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0
7494                  &&count_free_regs(regs[i].regmap)>minimum_free_regs[i])
7495               {
7496                 regs[i].regmap[hr]=rs1[i+1];
7497                 regmap_pre[i+1][hr]=rs1[i+1];
7498                 regs[i+1].regmap_entry[hr]=rs1[i+1];
7499                 regs[i].isdoingcp&=~(1<<hr);
7500                 regs[i].isdoingcp|=regs[i+1].isdoingcp&(1<<hr);
7501                 cpmap[i][hr]=cpmap[i+1][hr];
7502                 regs[i+1].wasdirty&=~(1<<hr);
7503                 regs[i].dirty&=~(1<<hr);
7504               }
7505             }
7506           }
7507           #endif
7508           #ifndef HOST_IMM_ADDR32
7509           // Preload map address
7510           if(itype[i+1]==LOAD||itype[i+1]==STORE) {
7511             hr=get_reg(regs[i+1].regmap,MOREG);
7512             if(hr>=0) {
7513               int sr=get_reg(regs[i+1].regmap,rs1[i+1]);
7514               if(sr>=0&&((regs[i+1].wasdoingcp>>sr)&1)
7515                  &&count_free_regs(regs[i].regmap)>minimum_free_regs[i]) {
7516                 int nr;
7517                 if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0)
7518                 {
7519                   regs[i].regmap[hr]=MGEN1+((i+1)&1);
7520                   regmap_pre[i+1][hr]=MGEN1+((i+1)&1);
7521                   regs[i+1].regmap_entry[hr]=MGEN1+((i+1)&1);
7522                   regs[i].isdoingcp&=~(1<<hr);
7523                   regs[i].isdoingcp|=regs[i+1].isdoingcp&(1<<hr);
7524                   cpmap[i][hr]=cpmap[i+1][hr];
7525                   regs[i+1].wasdirty&=~(1<<hr);
7526                   regs[i].dirty&=~(1<<hr);
7527                 }
7528                 else if((nr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1))>=0)
7529                 {
7530                   // move it to another register
7531                   regs[i+1].regmap[hr]=-1;
7532                   regmap_pre[i+2][hr]=-1;
7533                   regs[i+1].regmap[nr]=MOREG;
7534                   regmap_pre[i+2][nr]=MOREG;
7535                   regs[i].regmap[nr]=MGEN1+((i+1)&1);
7536                   regmap_pre[i+1][nr]=MGEN1+((i+1)&1);
7537                   regs[i+1].regmap_entry[nr]=MGEN1+((i+1)&1);
7538                   regs[i].isdoingcp&=~(1<<nr);
7539                   regs[i+1].isdoingcp&=~(1<<nr);
7540                   regs[i].dirty&=~(1<<nr);
7541                   regs[i+1].wasdirty&=~(1<<nr);
7542                   regs[i+1].dirty&=~(1<<nr);
7543                   regs[i+2].wasdirty&=~(1<<nr);
7544                 }
7545               }
7546             }
7547           }
7548           #endif
7549           // Address for store instruction (non-constant)
7550           if(itype[i+1]==STORE) {
7551             if(get_reg(regs[i+1].regmap,rs1[i+1])<0) {
7552               hr=get_reg2(regs[i].regmap,regs[i+1].regmap,-1);
7553               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
7554               else {regs[i+1].regmap[hr]=AGEN1+((i+1)&1);regs[i+1].isdoingcp&=~(1<<hr);}
7555               assert(hr>=0);
7556               if(regs[i].regmap[hr]<0&&regs[i+1].regmap_entry[hr]<0
7557                  &&count_free_regs(regs[i].regmap)>minimum_free_regs[i])
7558               {
7559                 regs[i].regmap[hr]=rs1[i+1];
7560                 regmap_pre[i+1][hr]=rs1[i+1];
7561                 regs[i+1].regmap_entry[hr]=rs1[i+1];
7562                 regs[i].isdoingcp&=~(1<<hr);
7563                 regs[i].isdoingcp|=regs[i+1].isdoingcp&(1<<hr);
7564                 cpmap[i][hr]=cpmap[i+1][hr];
7565                 regs[i+1].wasdirty&=~(1<<hr);
7566                 regs[i].dirty&=~(1<<hr);
7567               }
7568             }
7569           }
7570           // Load/store address (constant)
7571           if(itype[i+1]==LOAD||itype[i+1]==STORE) {
7572             if(itype[i+1]==LOAD)
7573               hr=get_reg(regs[i+1].regmap,rt1[i+1]);
7574             if(itype[i+1]==STORE) {
7575               hr=get_reg(regs[i+1].regmap,AGEN1+((i+1)&1));
7576               if(hr<0) hr=get_reg(regs[i+1].regmap,-1);
7577             }
7578             if(hr>=0&&regs[i].regmap[hr]<0
7579                &&count_free_regs(regs[i].regmap)>minimum_free_regs[i])
7580             {
7581               int rs=get_reg(regs[i+1].regmap,rs1[i+1]);
7582               if(rs>=0&&((regs[i+1].wasdoingcp>>rs)&1)) {
7583                 regs[i].regmap[hr]=AGEN1+((i+1)&1);
7584                 regmap_pre[i+1][hr]=AGEN1+((i+1)&1);
7585                 regs[i+1].regmap_entry[hr]=AGEN1+((i+1)&1);
7586                 regs[i].isdoingcp&=~(1<<hr);
7587                 regs[i+1].wasdirty&=~(1<<hr);
7588                 regs[i].dirty&=~(1<<hr);
7589               }
7590             }
7591           }
7592         }
7593       }
7594     }
7595   }
7596   }
7597 
7598   /* Pass 6 - Optimize clean/dirty state */
7599   clean_registers(0,slen-1,1);
7600 
7601   /* Pass 7 - Identify interrupt return locations */
7602 
7603   for (i=slen-1;i>=0;i--)
7604   {
7605     if(itype[i]==CJUMP||itype[i]==SJUMP)
7606     {
7607       // Avoid unnecessary constant propagation
7608       int hr;
7609       u32 sregs;
7610       for(hr=0;hr<HOST_REGS;hr++) {
7611         if(hr!=EXCLUDE_REG) {
7612           if(regs[i].regmap_entry[hr]>=0) {
7613             if(itype[i]==SJUMP) {
7614               if(regs[i].regmap_entry[hr]==rs1[i+1]) continue;
7615               if(regs[i].regmap_entry[hr]==rs2[i+1]) continue;
7616               if(regs[i].regmap_entry[hr]==rs3[i+1]) continue;
7617               if(regs[i].regmap_entry[hr]==rt1[i+1]) continue;
7618               if(regs[i].regmap_entry[hr]==rt2[i+1]) continue;
7619             }
7620             if(i>0) {
7621               if(regs[i].regmap_entry[hr]==rs1[i-1]) continue;
7622               if(regs[i].regmap_entry[hr]==rs2[i-1]) continue;
7623               if(regs[i].regmap_entry[hr]==rs3[i-1]) continue;
7624               if(regs[i].regmap_entry[hr]==rt1[i-1]) continue;
7625               if(regs[i].regmap_entry[hr]==rt2[i-1]) continue;
7626             }
7627             //if(regs[i].wasdoingcp&(1<<hr)) printf("drop wcp: %x\n",start+i*2);
7628             //if(regs[i].isdoingcp&(1<<hr)) printf("drop icp: %x\n",start+i*2);
7629             regs[i].wasdoingcp&=~(1<<hr);
7630             regs[i].isdoingcp&=~(1<<hr);
7631           }
7632         }
7633       }
7634       sregs=0;
7635       if(itype[i]==SJUMP)
7636       {
7637         // Don't intervene if constant propagation is being performed
7638         // on a register used by an instruction in the delay slot
7639         if(itype[i+1]==LOAD) {
7640           if(rs1[i+1]>=0) sregs|=1<<rs1[i+1];
7641           if(rs2[i+1]>=0) sregs|=1<<rs2[i+1];
7642         }
7643         if(itype[i+1]==STORE) {
7644           if(rs2[i+1]>=0) sregs|=1<<rs2[i+1];
7645           if(rs3[i+1]>=0) sregs|=1<<rs3[i+1];
7646         }
7647       }
7648       // If no constant propagation is being done, mark this address as a
7649       // branch target since it may be called upon return from interrupt
7650       if(!regs[i].wasdoingcp&&!(regs[i].isconst&sregs))
7651         bt[i]=1;
7652     }
7653   }
7654 
7655   /* Debug/disassembly */
7656   if((void*)assem_debug==(void*)printf)
7657   for(i=0;i<slen;i++)
7658   {
7659     int r;
7660     printf("U:");
7661     for(r=0;r<=CCREG;r++) {
7662       if((unneeded_reg[i]>>r)&1) {
7663         if(r==SR) printf(" SR(16)");
7664         else if(r==GBR) printf(" GBR(17)");
7665         else if(r==VBR) printf(" VBR(18)");
7666         else if(r==MACH) printf(" MACH(19)");
7667         else if(r==MACL) printf(" MACL(20)");
7668         else if(r==PR) printf(" PR(21)");
7669         else if(r==TBIT) printf(" T(22)");
7670         else printf(" r%d",r);
7671       }
7672     }
7673     printf("\n");
7674     #if defined(__i386__) || defined(__x86_64__)
7675     printf("pre: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7]);
7676     #endif
7677     #ifdef __arm__
7678     printf("pre: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regmap_pre[i][0],regmap_pre[i][1],regmap_pre[i][2],regmap_pre[i][3],regmap_pre[i][4],regmap_pre[i][5],regmap_pre[i][6],regmap_pre[i][7],regmap_pre[i][8],regmap_pre[i][9],regmap_pre[i][10],regmap_pre[i][12]);
7679     #endif
7680     printf("needs: ");
7681     if(needed_reg[i]&1) printf("eax ");
7682     if((needed_reg[i]>>1)&1) printf("ecx ");
7683     if((needed_reg[i]>>2)&1) printf("edx ");
7684     if((needed_reg[i]>>3)&1) printf("ebx ");
7685     if((needed_reg[i]>>5)&1) printf("ebp ");
7686     if((needed_reg[i]>>6)&1) printf("esi ");
7687     if((needed_reg[i]>>7)&1) printf("edi ");
7688     printf("\n");
7689     #if defined(__i386__) || defined(__x86_64__)
7690     printf("entry: eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7]);
7691     printf("dirty: ");
7692     if(regs[i].wasdirty&1) printf("eax ");
7693     if((regs[i].wasdirty>>1)&1) printf("ecx ");
7694     if((regs[i].wasdirty>>2)&1) printf("edx ");
7695     if((regs[i].wasdirty>>3)&1) printf("ebx ");
7696     if((regs[i].wasdirty>>5)&1) printf("ebp ");
7697     if((regs[i].wasdirty>>6)&1) printf("esi ");
7698     if((regs[i].wasdirty>>7)&1) printf("edi ");
7699     #endif
7700     #ifdef __arm__
7701     printf("entry: r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d\n",regs[i].regmap_entry[0],regs[i].regmap_entry[1],regs[i].regmap_entry[2],regs[i].regmap_entry[3],regs[i].regmap_entry[4],regs[i].regmap_entry[5],regs[i].regmap_entry[6],regs[i].regmap_entry[7],regs[i].regmap_entry[8],regs[i].regmap_entry[9],regs[i].regmap_entry[10],regs[i].regmap_entry[12]);
7702     printf("dirty: ");
7703     if(regs[i].wasdirty&1) printf("r0 ");
7704     if((regs[i].wasdirty>>1)&1) printf("r1 ");
7705     if((regs[i].wasdirty>>2)&1) printf("r2 ");
7706     if((regs[i].wasdirty>>3)&1) printf("r3 ");
7707     if((regs[i].wasdirty>>4)&1) printf("r4 ");
7708     if((regs[i].wasdirty>>5)&1) printf("r5 ");
7709     if((regs[i].wasdirty>>6)&1) printf("r6 ");
7710     if((regs[i].wasdirty>>7)&1) printf("r7 ");
7711     if((regs[i].wasdirty>>8)&1) printf("r8 ");
7712     if((regs[i].wasdirty>>9)&1) printf("r9 ");
7713     if((regs[i].wasdirty>>10)&1) printf("r10 ");
7714     if((regs[i].wasdirty>>12)&1) printf("r12 ");
7715     #endif
7716     printf("ccadj=%d",ccadj[i]);
7717     printf("\n");
7718     disassemble_inst(i);
7719     //printf ("ccadj[%d] = %d\n",i,ccadj[i]);
7720     #if defined(__i386__) || defined(__x86_64__)
7721     printf("eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7]);
7722     if(regs[i].dirty&1) printf("eax ");
7723     if((regs[i].dirty>>1)&1) printf("ecx ");
7724     if((regs[i].dirty>>2)&1) printf("edx ");
7725     if((regs[i].dirty>>3)&1) printf("ebx ");
7726     if((regs[i].dirty>>5)&1) printf("ebp ");
7727     if((regs[i].dirty>>6)&1) printf("esi ");
7728     if((regs[i].dirty>>7)&1) printf("edi ");
7729     #endif
7730     #ifdef __arm__
7731     printf("r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",regs[i].regmap[0],regs[i].regmap[1],regs[i].regmap[2],regs[i].regmap[3],regs[i].regmap[4],regs[i].regmap[5],regs[i].regmap[6],regs[i].regmap[7],regs[i].regmap[8],regs[i].regmap[9],regs[i].regmap[10],regs[i].regmap[12]);
7732     if(regs[i].dirty&1) printf("r0 ");
7733     if((regs[i].dirty>>1)&1) printf("r1 ");
7734     if((regs[i].dirty>>2)&1) printf("r2 ");
7735     if((regs[i].dirty>>3)&1) printf("r3 ");
7736     if((regs[i].dirty>>4)&1) printf("r4 ");
7737     if((regs[i].dirty>>5)&1) printf("r5 ");
7738     if((regs[i].dirty>>6)&1) printf("r6 ");
7739     if((regs[i].dirty>>7)&1) printf("r7 ");
7740     if((regs[i].dirty>>8)&1) printf("r8 ");
7741     if((regs[i].dirty>>9)&1) printf("r9 ");
7742     if((regs[i].dirty>>10)&1) printf("r10 ");
7743     if((regs[i].dirty>>12)&1) printf("r12 ");
7744     #endif
7745     printf("\n");
7746     if(regs[i].isdoingcp) {
7747       printf("constants: ");
7748       #if defined(__i386__) || defined(__x86_64__)
7749       if(regs[i].isdoingcp&1) printf("eax=%x ",(int)cpmap[i][0]);
7750       if((regs[i].isdoingcp>>1)&1) printf("ecx=%x ",(int)cpmap[i][1]);
7751       if((regs[i].isdoingcp>>2)&1) printf("edx=%x ",(int)cpmap[i][2]);
7752       if((regs[i].isdoingcp>>3)&1) printf("ebx=%x ",(int)cpmap[i][3]);
7753       if((regs[i].isdoingcp>>5)&1) printf("ebp=%x ",(int)cpmap[i][5]);
7754       if((regs[i].isdoingcp>>6)&1) printf("esi=%x ",(int)cpmap[i][6]);
7755       if((regs[i].isdoingcp>>7)&1) printf("edi=%x ",(int)cpmap[i][7]);
7756       #endif
7757       #ifdef __arm__
7758       if(regs[i].isdoingcp&1) printf("r0=%x ",(int)cpmap[i][0]);
7759       if((regs[i].isdoingcp>>1)&1) printf("r1=%x ",(int)cpmap[i][1]);
7760       if((regs[i].isdoingcp>>2)&1) printf("r2=%x ",(int)cpmap[i][2]);
7761       if((regs[i].isdoingcp>>3)&1) printf("r3=%x ",(int)cpmap[i][3]);
7762       if((regs[i].isdoingcp>>4)&1) printf("r4=%x ",(int)cpmap[i][4]);
7763       if((regs[i].isdoingcp>>5)&1) printf("r5=%x ",(int)cpmap[i][5]);
7764       if((regs[i].isdoingcp>>6)&1) printf("r6=%x ",(int)cpmap[i][6]);
7765       if((regs[i].isdoingcp>>7)&1) printf("r7=%x ",(int)cpmap[i][7]);
7766       if((regs[i].isdoingcp>>8)&1) printf("r8=%x ",(int)cpmap[i][8]);
7767       if((regs[i].isdoingcp>>9)&1) printf("r9=%x ",(int)cpmap[i][9]);
7768       if((regs[i].isdoingcp>>10)&1) printf("r10=%x ",(int)cpmap[i][10]);
7769       if((regs[i].isdoingcp>>12)&1) printf("r12=%x ",(int)cpmap[i][12]);
7770       #endif
7771       printf("\n");
7772     }
7773     if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==CJUMP||itype[i]==SJUMP) {
7774       #if defined(__i386__) || defined(__x86_64__)
7775       printf("branch(%d): eax=%d ecx=%d edx=%d ebx=%d ebp=%d esi=%d edi=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7]);
7776       if(branch_regs[i].dirty&1) printf("eax ");
7777       if((branch_regs[i].dirty>>1)&1) printf("ecx ");
7778       if((branch_regs[i].dirty>>2)&1) printf("edx ");
7779       if((branch_regs[i].dirty>>3)&1) printf("ebx ");
7780       if((branch_regs[i].dirty>>5)&1) printf("ebp ");
7781       if((branch_regs[i].dirty>>6)&1) printf("esi ");
7782       if((branch_regs[i].dirty>>7)&1) printf("edi ");
7783       #endif
7784       #ifdef __arm__
7785       printf("branch(%d): r0=%d r1=%d r2=%d r3=%d r4=%d r5=%d r6=%d r7=%d r8=%d r9=%d r10=%d r12=%d dirty: ",i,branch_regs[i].regmap[0],branch_regs[i].regmap[1],branch_regs[i].regmap[2],branch_regs[i].regmap[3],branch_regs[i].regmap[4],branch_regs[i].regmap[5],branch_regs[i].regmap[6],branch_regs[i].regmap[7],branch_regs[i].regmap[8],branch_regs[i].regmap[9],branch_regs[i].regmap[10],branch_regs[i].regmap[12]);
7786       if(branch_regs[i].dirty&1) printf("r0 ");
7787       if((branch_regs[i].dirty>>1)&1) printf("r1 ");
7788       if((branch_regs[i].dirty>>2)&1) printf("r2 ");
7789       if((branch_regs[i].dirty>>3)&1) printf("r3 ");
7790       if((branch_regs[i].dirty>>4)&1) printf("r4 ");
7791       if((branch_regs[i].dirty>>5)&1) printf("r5 ");
7792       if((branch_regs[i].dirty>>6)&1) printf("r6 ");
7793       if((branch_regs[i].dirty>>7)&1) printf("r7 ");
7794       if((branch_regs[i].dirty>>8)&1) printf("r8 ");
7795       if((branch_regs[i].dirty>>9)&1) printf("r9 ");
7796       if((branch_regs[i].dirty>>10)&1) printf("r10 ");
7797       if((branch_regs[i].dirty>>12)&1) printf("r12 ");
7798       #endif
7799       printf("\n");
7800     }
7801   }
7802 
7803   /* Pass 8 - Assembly */
7804   {
7805   u32 dirty_pre=0;
7806   linkcount=0;stubcount=0;
7807   ds=0;is_delayslot=0;
7808   beginning=(pointer)out;
7809   for(i=0;i<slen;i++)
7810   {
7811     //if(ds) printf("ds: ");
7812     if((void*)assem_debug==(void*)printf) disassemble_inst(i);
7813     if(ds) {
7814       ds=0; // Skip delay slot
7815       if(bt[i]) assem_debug("OOPS - branch into delay slot\n");
7816       instr_addr[i]=0;
7817     } else {
7818       int srloaded;
7819 
7820       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&itype[i-1]!=DATA))
7821       {
7822         wb_valid(regmap_pre[i],regs[i].regmap_entry,dirty_pre,regs[i].wasdirty,
7823               unneeded_reg[i]);
7824       }
7825       if(itype[i]==SJUMP) dirty_pre=branch_regs[i].dirty;
7826       else dirty_pre=regs[i].dirty;
7827       // write back
7828       if(i<2||(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&itype[i-1]!=DATA))
7829       {
7830         wb_invalidate(regmap_pre[i],regs[i].regmap_entry,regs[i].wasdirty,
7831                       unneeded_reg[i]);
7832         loop_preload(regmap_pre[i],regs[i].regmap_entry);
7833       }
7834       // branch target entry point
7835       instr_addr[i]=(pointer)out;
7836       assem_debug("<->\n");
7837       // load regs
7838       if(regs[i].regmap_entry[HOST_CCREG]==CCREG&&regs[i].regmap[HOST_CCREG]!=CCREG)
7839         wb_register(CCREG,regs[i].regmap_entry,regs[i].wasdirty);
7840       load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i],rs2[i],rs3[i]);
7841       srloaded=(rs1[i]==TBIT||rs2[i]==TBIT||rs3[i]==TBIT||rs1[i]==SR||rs2[i]==SR||rs3[i]==SR);
7842       if(rt1[i]==TBIT||rt2[i]==TBIT)
7843         if(!srloaded&&rt1[i]!=SR&&rt2[i]!=SR)
7844           {srloaded=1;load_regs(regs[i].regmap_entry,regs[i].regmap,SR,SR,SR);}
7845       address_generation(i,&regs[i],regs[i].regmap_entry);
7846       load_consts(regmap_pre[i],regs[i].regmap,i);
7847       if(itype[i]==RJUMP||itype[i]==UJUMP||itype[i]==SJUMP)
7848       {
7849         // Load the delay slot registers if necessary
7850         if(!srloaded&&rt1[i+1]!=SR&&rt2[i+1]!=SR&&(rt1[i+1]==TBIT||rt2[i+1]==TBIT))
7851           {srloaded=1;load_regs(regs[i].regmap_entry,regs[i].regmap,SR,SR,SR);}
7852 
7853         if(rs1[i+1]!=rs1[i]&&rs1[i+1]!=rs2[i]&&rs1[i+1]!=rs3[i])
7854           if(!srloaded||(rs1[i+1]!=TBIT&&rs1[i+1]!=SR))
7855             load_regs(regs[i].regmap_entry,regs[i].regmap,rs1[i+1],rs1[i+1],rs1[i+1]);
7856 
7857         if(rs2[i+1]!=rs1[i+1]&&rs2[i+1]!=rs1[i]&&rs2[i+1]!=rs2[i]&&rs2[i+1]!=rs3[i])
7858           if(!srloaded||(rs2[i+1]!=TBIT&&rs2[i+1]!=SR))
7859             load_regs(regs[i].regmap_entry,regs[i].regmap,rs2[i+1],rs2[i+1],rs2[i+1]);
7860 
7861         if(rs3[i+1]!=rs1[i+1]&&rs3[i+1]!=rs2[i+1]&&rs3[i+1]!=rs1[i]&&rs3[i+1]!=rs2[i]&&rs3[i+1]!=rs3[i])
7862           if(!srloaded||(rs3[i+1]!=TBIT&&rs3[i+1]!=SR))
7863             load_regs(regs[i].regmap_entry,regs[i].regmap,rs3[i+1],rs3[i+1],rs3[i+1]);
7864       }
7865       else if(i+1<slen)
7866       {
7867         signed char preload1, preload2, preload3;
7868         // Preload registers for following instruction
7869         preload1=rs1[i+1];
7870         if(preload1==TBIT||preload1==SR) {
7871           if(!srloaded) {preload1=SR;srloaded=1;}
7872           else preload1=-1;
7873         }
7874         if(preload1!=rs1[i]&&preload1!=rs2[i]&&preload1!=rs3[i])
7875           if(preload1!=rt1[i]&&preload1!=rt2[i])
7876             load_regs(regs[i].regmap_entry,regs[i].regmap,preload1,preload1,preload1);
7877         preload2=rs2[i+1];
7878         if(preload2==TBIT||preload2==SR) {
7879           if(!srloaded) {preload2=SR;srloaded=1;}
7880           else preload2=-1;
7881         }
7882         if(preload2!=rs1[i+1]&&preload2!=rs1[i]&&preload2!=rs2[i]&&preload2!=rs3[i])
7883           if(preload2!=rt1[i]&&preload2!=rt2[i])
7884             load_regs(regs[i].regmap_entry,regs[i].regmap,preload2,preload2,preload2);
7885         preload3=rs3[i+1];
7886         if(preload3==TBIT||preload3==SR) {
7887           if(!srloaded) {preload3=SR;srloaded=1;}
7888           else preload3=-1;
7889         }
7890         if(preload3!=rs1[i+1]&&preload3!=rs2[i+1]&&preload3!=rs1[i]&&preload3!=rs2[i]&&preload3!=rs3[i])
7891           if(preload3!=rt1[i]&&preload3!=rt2[i])
7892             load_regs(regs[i].regmap_entry,regs[i].regmap,preload3,preload3,preload3);
7893         if(rt1[i+1]==TBIT||rt2[i+1]==TBIT)
7894           if(!srloaded&&rt1[i]!=SR&&rt2[i]!=SR&&rt1[i+1]!=SR&&rt2[i+1]!=SR)
7895             {srloaded=1;load_regs(regs[i].regmap_entry,regs[i].regmap,SR,SR,SR);}
7896       }
7897       // TODO: if(is_ooo(i)) address_generation(i+1);
7898       if(itype[i]==LOAD||itype[i]==STORE||itype[i]==RMW)
7899         load_regs(regs[i].regmap_entry,regs[i].regmap,MMREG,MMREG,MMREG);
7900       // assemble
7901       switch(itype[i]) {
7902         case ALU:
7903           alu_assemble(i,&regs[i]);break;
7904         case IMM8:
7905           imm8_assemble(i,&regs[i]);break;
7906         case SHIFTIMM:
7907           shiftimm_assemble(i,&regs[i]);break;
7908         case LOAD:
7909           load_assemble(i,&regs[i]);break;
7910         case STORE:
7911           store_assemble(i,&regs[i]);break;
7912         case RMW:
7913           rmw_assemble(i,&regs[i]);break;
7914         case PCREL:
7915           pcrel_assemble(i,&regs[i]);break;
7916         case MULTDIV:
7917           multdiv_assemble(i,&regs[i]);break;
7918         case MOV:
7919           mov_assemble(i,&regs[i]);break;
7920         case EXT:
7921           ext_assemble(i,&regs[i]);break;
7922         case FLAGS:
7923           flags_assemble(i,&regs[i]);break;
7924         case COMPLEX:
7925           complex_assemble(i,&regs[i]);break;
7926         case SYSTEM:
7927           system_assemble(i,&regs[i]);break;
7928         case BIOS:
7929           bios_assemble(i,&regs[i]);break;
7930         case UJUMP:
7931           ujump_assemble(i,&regs[i]);ds=1;break;
7932         case RJUMP:
7933           rjump_assemble(i,&regs[i]);ds=1;break;
7934         case CJUMP:
7935           cjump_assemble(i,&regs[i]);break;
7936         case SJUMP:
7937           sjump_assemble(i,&regs[i]);ds=1;break;
7938       }
7939       if(itype[i]==UJUMP||itype[i]==RJUMP||(source[i]>>16)==0x1000)
7940         literal_pool(1024);
7941       else
7942         literal_pool_jumpover(256);
7943     }
7944   }
7945   // If the block did not end with an unconditional branch,
7946   // add a jump to the next instruction.
7947   if(i>1) {
7948     if(itype[i-2]!=UJUMP&&itype[i-2]!=RJUMP&&itype[i-1]!=DATA) {
7949       assert(i==slen);
7950       if(itype[i-2]!=SJUMP) {
7951         store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*2);
7952         if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
7953           emit_loadreg(CCREG,HOST_CCREG);
7954         emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
7955       }
7956       else
7957       {
7958         store_regs_bt(regs[i-2].regmap,regs[i-2].dirty,start+i*2);
7959         assert(regs[i-2].regmap[HOST_CCREG]==CCREG);
7960       }
7961       add_to_linker((int)out,start+i*2,0);
7962       emit_jmp(0);
7963     }
7964   }
7965   else
7966   {
7967     assert(i>0);
7968     store_regs_bt(regs[i-1].regmap,regs[i-1].dirty,start+i*2);
7969     if(regs[i-1].regmap[HOST_CCREG]!=CCREG)
7970       emit_loadreg(CCREG,HOST_CCREG);
7971     emit_addimm(HOST_CCREG,CLOCK_DIVIDER*(ccadj[i-1]+1),HOST_CCREG);
7972     add_to_linker((int)out,start+i*2,0);
7973     emit_jmp(0);
7974   }
7975 
7976   // Stubs
7977   for(i=0;i<stubcount;i++)
7978   {
7979     switch(stubs[i][0])
7980     {
7981       case LOADB_STUB:
7982       case LOADW_STUB:
7983       case LOADL_STUB:
7984       case LOADS_STUB:
7985         do_readstub(i);break;
7986       case STOREB_STUB:
7987       case STOREW_STUB:
7988       case STOREL_STUB:
7989         do_writestub(i);break;
7990       case RMWT_STUB:
7991       case RMWA_STUB:
7992       case RMWX_STUB:
7993       case RMWO_STUB:
7994         do_rmwstub(i);break;
7995       case CC_STUB:
7996         do_ccstub(i);break;
7997     }
7998   }
7999   }
8000 
8001   /* Pass 9 - Linker */
8002   {
8003   u32 *ht_bin;
8004   int entry_point;
8005   u32 alignedlen;
8006   u32 alignedstart;
8007   u32 index;
8008   for(i=0;i<linkcount;i++)
8009   {
8010     assem_debug("%8x -> %8x\n",link_addr[i][0],link_addr[i][1]);
8011     literal_pool(64);
8012     if(!link_addr[i][2])
8013     {
8014       void *stub=out;
8015       void *addr=check_addr(link_addr[i][1]);
8016       emit_extjump(link_addr[i][0],link_addr[i][1]);
8017       if(addr) {
8018         set_jump_target(link_addr[i][0],(int)addr);
8019         add_link(link_addr[i][1],stub);
8020       }
8021       else set_jump_target(link_addr[i][0],(int)stub);
8022     }
8023     else
8024     {
8025       // Internal branch
8026       int target=(link_addr[i][1]-start)>>1;
8027       assert(target>=0&&target<slen);
8028       assert(instr_addr[target]);
8029       //#ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
8030       //set_jump_target_fillslot(link_addr[i][0],instr_addr[target],link_addr[i][2]>>1);
8031       //#else
8032       set_jump_target(link_addr[i][0],instr_addr[target]);
8033       //#endif
8034     }
8035   }
8036   // External Branch Targets (jump_in)
8037   if(copy+slen*2+4>shadow+sizeof(shadow)) copy=shadow;
8038   for(i=0;i<slen;i++)
8039   {
8040     if(bt[i]||i==0)
8041     {
8042       if(itype[i]==CJUMP||itype[i]==SJUMP) assert(instr_addr[i]);
8043       if(instr_addr[i]) // TODO - delay slots (=null)
8044       {
8045         u32 vaddr=start+i*2+slave;
8046         u32 page=(vaddr&0xDFFFFFFF)>>12;
8047         if(page>1024) page=1024+(page&1023);
8048         literal_pool(256);
8049         assem_debug("%8x (%d) <- %8x\n",instr_addr[i],i,start+i*2);
8050         assem_debug("jump_in: %x\n",start+i*2);
8051         ll_add(jump_dirty+page,vaddr,(void *)out);
8052         entry_point=do_dirty_stub(i);
8053         ll_add_nodup(jump_in+page,vaddr,(void *)entry_point);
8054         if((itype[i]==CJUMP||itype[i]==SJUMP)&&ccstub_return[i]) set_jump_target(ccstub_return[i],entry_point);
8055 
8056         // If there was an existing entry in the hash table,
8057         // replace it with the new address.
8058         // Don't add new entries.  We'll insert the
8059         // ones that actually get used in check_addr().
8060         ht_bin=hash_table[((vaddr>>16)^vaddr)&0xFFFF];
8061         if(ht_bin[0]==vaddr) {
8062           ht_bin[1]=entry_point;
8063         }
8064         if(ht_bin[2]==vaddr) {
8065           ht_bin[3]=entry_point;
8066         }
8067       }
8068     }
8069   }
8070   // Write out the literal pool if necessary
8071   literal_pool(0);
8072   #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK
8073   // Align code
8074   if(((u32)out)&7) emit_addnop(13);
8075   #endif
8076   assert((pointer)out-beginning<MAX_OUTPUT_BLOCK_SIZE);
8077   //printf("shadow buffer: %x-%x\n",(int)copy,(int)copy+slen*4);
8078   alignedlen=((((u32)source)+slen*2+2)&~2)-(u32)alignedsource;
8079   memcpy(copy,alignedsource,alignedlen);
8080   copy+=alignedlen;
8081 
8082   #ifdef __arm__
8083   __clear_cache((void *)beginning,out);
8084   #endif
8085 
8086   // If we're within 256K of the end of the buffer,
8087   // start over from the beginning. (Is 256K enough?)
8088   if((int)out>BASE_ADDR+(1<<TARGET_SIZE_2)-MAX_OUTPUT_BLOCK_SIZE-JUMP_TABLE_SIZE) out=(u8 *)BASE_ADDR;
8089 
8090   // Trap writes to any of the pages we compiled
8091   for(i=start>>12;i<=(start+slen*2)>>12;i++) {
8092     //invalid_code[i]=0;
8093     cached_code[i>>3]|=1<<(i&7);
8094     cached_code[(i^0x20000)>>3]|=1<<(i&7);
8095     #ifdef POINTERS_64BIT
8096     memory_map[i]|=0x4000000000000000LL;
8097     memory_map[i^0x20000]|=0x4000000000000000LL;
8098     #else
8099     memory_map[i]|=0x40000000;
8100     memory_map[i^0x20000]|=0x40000000;
8101     #endif
8102   }
8103   alignedstart=start&~3;
8104   index=alignedstart&0xDFFFFFFF;
8105   if(index>4194304) index=(addr|0x400000)&0x7fffff;
8106   for(i=0;i<alignedlen;i+=4) {
8107     cached_code_words[(index+i)>>5]|=1<<(((index+i)>>2)&7);
8108   }
8109   }
8110 
8111   /* Pass 10 - Free memory by expiring oldest blocks */
8112 
8113   {
8114   int end=((((int)out-BASE_ADDR)>>(TARGET_SIZE_2-16))+16384)&65535;
8115   while(expirep!=end)
8116   {
8117     int shift=TARGET_SIZE_2-3; // Divide into 8 blocks
8118     int base=BASE_ADDR+((expirep>>13)<<shift); // Base address of this block
8119     inv_debug("EXP: Phase %d\n",expirep);
8120     switch((expirep>>11)&3)
8121     {
8122       case 0:
8123         // Clear jump_in and jump_dirty
8124         ll_remove_matching_addrs(jump_in+(expirep&2047),base,shift);
8125         ll_remove_matching_addrs(jump_dirty+(expirep&2047),base,shift);
8126         break;
8127       case 1:
8128         // Clear pointers
8129         ll_kill_pointers(jump_out[expirep&2047],base,shift);
8130         break;
8131       case 2:
8132         // Clear hash table
8133         for(i=0;i<32;i++) {
8134           u32 *ht_bin=hash_table[((expirep&2047)<<5)+i];
8135           if((ht_bin[3]>>shift)==(base>>shift) ||
8136              ((ht_bin[3]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
8137             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[2],ht_bin[3]);
8138             ht_bin[2]=ht_bin[3]=-1;
8139           }
8140           if((ht_bin[1]>>shift)==(base>>shift) ||
8141              ((ht_bin[1]-MAX_OUTPUT_BLOCK_SIZE)>>shift)==(base>>shift)) {
8142             inv_debug("EXP: Remove hash %x -> %x\n",ht_bin[0],ht_bin[1]);
8143             ht_bin[0]=ht_bin[2];
8144             ht_bin[1]=ht_bin[3];
8145             ht_bin[2]=ht_bin[3]=-1;
8146           }
8147         }
8148         break;
8149       case 3:
8150         // Clear jump_out
8151         if((expirep&2047)==0) {
8152           #ifdef __arm__
8153           do_clear_cache();
8154           #endif
8155           #ifdef USE_MINI_HT
8156           memset(mini_ht_master,-1,sizeof(mini_ht_master));
8157           memset(mini_ht_slave,-1,sizeof(mini_ht_slave));
8158           #endif
8159         }
8160         ll_remove_matching_addrs(jump_out+(expirep&2047),base,shift);
8161         break;
8162     }
8163     expirep=(expirep+1)&65535;
8164   }
8165   }
8166   return 0;
8167 }
8168 
8169 #include "../sh2core.h"
8170 
8171 extern int framecounter;
DynarecMasterHandleInterrupts()8172 void DynarecMasterHandleInterrupts()
8173 {
8174   if (MSH2->interrupts[MSH2->NumberOfInterrupts-1].level > ((master_reg[SR]>>4)&0xF))
8175   {
8176     master_reg[15] -= 4;
8177     MappedMemoryWriteLongNocache(MSH2, master_reg[15], master_reg[SR]);
8178     master_reg[15] -= 4;
8179     MappedMemoryWriteLongNocache(MSH2, master_reg[15], master_pc);
8180     master_reg[SR] &= 0xFFFFFF0F;
8181     master_reg[SR] |= (MSH2->interrupts[MSH2->NumberOfInterrupts-1].level)<<4;
8182     master_pc = MappedMemoryReadLongNocache(MSH2, master_reg[VBR] + (MSH2->interrupts[MSH2->NumberOfInterrupts-1].vector << 2));
8183     master_ip = get_addr_ht(master_pc);
8184     MSH2->NumberOfInterrupts--;
8185     MSH2->isIdle = 0;
8186     MSH2->isSleeping = 0;
8187   }
8188   //printf("DynarecMasterHandleInterrupts pc=%x ip=%x\n",master_pc,(int)master_ip);
8189   //printf("master_cc=%d slave_cc=%d\n",master_cc,slave_cc);
8190   //printf("frame=%d\n",framecounter);
8191 }
8192 
DynarecSlaveHandleInterrupts()8193 void DynarecSlaveHandleInterrupts()
8194 {
8195   if (SSH2->interrupts[SSH2->NumberOfInterrupts-1].level > ((slave_reg[SR]>>4)&0xF))
8196   {
8197     slave_reg[15] -= 4;
8198     MappedMemoryWriteLongNocache(SSH2, slave_reg[15], slave_reg[SR]);
8199     slave_reg[15] -= 4;
8200     MappedMemoryWriteLongNocache(SSH2, slave_reg[15], slave_pc);
8201     slave_reg[SR] &= 0xFFFFFF0F;
8202     slave_reg[SR] |= (SSH2->interrupts[SSH2->NumberOfInterrupts-1].level)<<4;
8203     slave_pc = MappedMemoryReadLongNocache(SSH2, slave_reg[VBR] + (SSH2->interrupts[SSH2->NumberOfInterrupts-1].vector << 2));
8204     slave_ip = get_addr_ht(slave_pc|1);
8205     SSH2->NumberOfInterrupts--;
8206     SSH2->isIdle = 0;
8207     SSH2->isSleeping = 0;
8208   }
8209   //printf("DynarecSlaveHandleInterrupts pc=%x ip=%x\n",slave_pc,(int)slave_ip);
8210   //printf("master_cc=%d slave_cc=%d\n",master_cc,slave_cc);
8211 }
8212 
// Interrupt queue management is not implemented in this file: these three
// routines are only declared here and are installed in the SH2Dynarec
// interface table below.  (Their names suggest they belong to the
// interpreter core - confirm where they are defined.)
void SH2InterpreterSendInterrupt(SH2_struct *context, u8 level, u8 vector);
int SH2InterpreterGetInterrupts(SH2_struct *context,
                                interrupt_struct interrupts[MAX_INTERRUPTS]);
void SH2InterpreterSetInterrupts(SH2_struct *context, int num_interrupts,
                                 const interrupt_struct interrupts[MAX_INTERRUPTS]);
8218 
/*
 * Init entry of the SH2Dynarec interface table.
 * Performs no work in this function: all three arguments are ignored and
 * 0 is always returned.
 */
int SH2DynarecInit(enum SHMODELTYPE model, SH2_struct *msh, SH2_struct *ssh)
{
  (void)model;
  (void)msh;
  (void)ssh;
  return 0;
}
8220 
/*
 * DeInit entry of the SH2Dynarec interface table.
 * Simply forwards to sh2_dynarec_cleanup().
 */
void SH2DynarecDeInit()
{
  sh2_dynarec_cleanup();
}
8224 
SH2DynarecExec(SH2_struct * context,u32 cycles)8225 void FASTCALL SH2DynarecExec(SH2_struct *context, u32 cycles) {
8226   printf("SH2DynarecExec called! oops\n");
8227   printf("master_ip=%x\n",(int)master_ip);
8228   exit(1);
8229 }
8230 
SH2DynarecGetSR(SH2_struct * context)8231 u32 SH2DynarecGetSR(SH2_struct *context)
8232 {
8233   if(context==MSH2)
8234     return master_reg[SR];
8235   else
8236     return slave_reg[SR];
8237 }
SH2DynarecGetGBR(SH2_struct * context)8238 u32 SH2DynarecGetGBR(SH2_struct *context)
8239 {
8240   if(context==MSH2)
8241     return master_reg[GBR];
8242   else
8243     return slave_reg[GBR];
8244 }
SH2DynarecGetVBR(SH2_struct * context)8245 u32 SH2DynarecGetVBR(SH2_struct *context)
8246 {
8247   if(context==MSH2)
8248     return master_reg[VBR];
8249   else
8250     return slave_reg[VBR];
8251 }
SH2DynarecGetMACH(SH2_struct * context)8252 u32 SH2DynarecGetMACH(SH2_struct *context)
8253 {
8254   if(context==MSH2)
8255     return master_reg[MACH];
8256   else
8257     return slave_reg[MACH];
8258 }
SH2DynarecGetMACL(SH2_struct * context)8259 u32 SH2DynarecGetMACL(SH2_struct *context)
8260 {
8261   if(context==MSH2)
8262     return master_reg[MACL];
8263   else
8264     return slave_reg[MACL];
8265 }
SH2DynarecGetPR(SH2_struct * context)8266 u32 SH2DynarecGetPR(SH2_struct *context)
8267 {
8268   if(context==MSH2)
8269     return master_reg[PR];
8270   else
8271     return slave_reg[PR];
8272 }
SH2DynarecGetGPR(SH2_struct * context,int num)8273 u32 SH2DynarecGetGPR(SH2_struct *context, int num)
8274 {
8275   if(context==MSH2)
8276     return master_reg[num];
8277   else
8278     return slave_reg[num];
8279 }
8280 
SH2DynarecGetPC(SH2_struct * context)8281 u32 SH2DynarecGetPC(SH2_struct *context)
8282 {
8283   if(context==MSH2)
8284     return master_pc;
8285   else
8286     return slave_pc;
8287 }
8288 
/*
 * Store `value` into SR of the selected CPU.
 * MSH2 targets the master register file; any other context targets the
 * slave register file.
 */
void SH2DynarecSetSR(SH2_struct *context, u32 value)
{
  if(context!=MSH2) {
    slave_reg[SR]=value;
    return;
  }
  master_reg[SR]=value;
}
/*
 * Store `value` into GBR of the selected CPU.
 * MSH2 targets the master register file; any other context targets the
 * slave register file.
 */
void SH2DynarecSetGBR(SH2_struct *context, u32 value)
{
  if(context!=MSH2) {
    slave_reg[GBR]=value;
    return;
  }
  master_reg[GBR]=value;
}
/*
 * Store `value` into VBR of the selected CPU.
 * MSH2 targets the master register file; any other context targets the
 * slave register file.
 */
void SH2DynarecSetVBR(SH2_struct *context, u32 value)
{
  if(context!=MSH2) {
    slave_reg[VBR]=value;
    return;
  }
  master_reg[VBR]=value;
}
/*
 * Store `value` into MACH of the selected CPU.
 * MSH2 targets the master register file; any other context targets the
 * slave register file.
 */
void SH2DynarecSetMACH(SH2_struct *context, u32 value)
{
  if(context!=MSH2) {
    slave_reg[MACH]=value;
    return;
  }
  master_reg[MACH]=value;
}
/*
 * Store `value` into MACL of the selected CPU.
 * MSH2 targets the master register file; any other context targets the
 * slave register file.
 */
void SH2DynarecSetMACL(SH2_struct *context, u32 value)
{
  if(context!=MSH2) {
    slave_reg[MACL]=value;
    return;
  }
  master_reg[MACL]=value;
}
/*
 * Store `value` into PR of the selected CPU.
 * MSH2 targets the master register file; any other context targets the
 * slave register file.
 */
void SH2DynarecSetPR(SH2_struct *context, u32 value)
{
  if(context!=MSH2) {
    slave_reg[PR]=value;
    return;
  }
  master_reg[PR]=value;
}
/*
 * Store `value` into register slot `num` of the selected CPU.
 * MSH2 targets the master register file; any other context targets the
 * slave register file.  `num` is used as an array index without any
 * range check.
 */
void SH2DynarecSetGPR(SH2_struct *context, int num, u32 value)
{
  if(context!=MSH2) {
    slave_reg[num]=value;
    return;
  }
  master_reg[num]=value;
}
8331 
/*
 * Set the program counter of the selected CPU and refresh the matching
 * instruction pointer (master_ip / slave_ip) through get_addr_ht().
 *
 * MSH2 targets the master; any other context targets the slave.  The slave
 * lookup address has bit 0 set, the same way DynarecSlaveHandleInterrupts
 * builds it (slave_pc|1).  OR-ing instead of adding 1 keeps the two sites
 * consistent and can never carry into the address bits.
 */
void SH2DynarecSetPC(SH2_struct *context, u32 value) {
  if(context==MSH2) {
    master_pc=value;
    master_ip=get_addr_ht(value);
  }
  else {
    slave_pc=value;
    slave_ip=get_addr_ht(value|1);
  }
}
8343 
// Drop the register-name macros (used above as indices into master_reg /
// slave_reg) so that the code below can refer to the sh2regs_struct members
// of the same names (regs->SR, regs->GBR, ...) without macro expansion.
#undef SR
#undef GBR
#undef VBR
#undef MACH
#undef MACL
#undef PR
8350 
/*
 * Fill *regs with a snapshot of the selected CPU's registers.
 *
 * regs->R receives the first 16 entries of master_reg (context == MSH2) or
 * slave_reg (any other context); the remaining fields are read through the
 * individual SH2DynarecGet* accessors.
 */
void SH2DynarecGetRegisters(SH2_struct *context, sh2regs_struct *regs)
{
  // Bulk-copy the leading 16 register slots
  memcpy(&(regs->R), (context==MSH2) ? master_reg : slave_reg, 16*sizeof(int));

  // Control registers
  regs->SR.all=SH2DynarecGetSR(context);
  regs->GBR=SH2DynarecGetGBR(context);
  regs->VBR=SH2DynarecGetVBR(context);

  // System registers
  regs->MACH=SH2DynarecGetMACH(context);
  regs->MACL=SH2DynarecGetMACL(context);
  regs->PR=SH2DynarecGetPR(context);
  regs->PC=SH2DynarecGetPC(context);
}
8365 
/*
 * Load the selected CPU's registers from *regs.
 *
 * regs->R overwrites the first 16 entries of master_reg (context == MSH2)
 * or slave_reg (any other context); the remaining fields are written
 * through the individual SH2DynarecSet* accessors.  The PC is written last,
 * via SH2DynarecSetPC.
 */
void SH2DynarecSetRegisters(SH2_struct *context, const sh2regs_struct *regs)
{
  // Bulk-copy the leading 16 register slots
  memcpy((context==MSH2) ? master_reg : slave_reg, &(regs->R), 16*sizeof(int));

  // Control registers
  SH2DynarecSetSR(context, regs->SR.all);
  SH2DynarecSetGBR(context, regs->GBR);
  SH2DynarecSetVBR(context, regs->VBR);

  // System registers
  SH2DynarecSetMACH(context, regs->MACH);
  SH2DynarecSetMACL(context, regs->MACL);
  SH2DynarecSetPR(context, regs->PR);

  SH2DynarecSetPC(context, regs->PC);
}
8380 
/*
 * Notification that `length` bytes starting at `start` have been written.
 *
 * Writes whose start address lies outside the two RAM windows tested below
 * are ignored.  Otherwise every 4 KiB page touched by the write is looked
 * up in the cached_code bitmap (one bit per page); if at least one page is
 * marked, the whole page range is handed to invalidate_blocks().
 */
void SH2DynarecWriteNotify(u32 start, u32 length)
{
  const u32 first_page=start>>12;
  const u32 last_page=(start+length-1)>>12;
  const int in_low_window=((start&0xDFF00000)==0x200000);
  const int in_high_window=((start&0xDE000000)==0x6000000);
  u32 page;
  int has_code=0;

  // Only the start address decides whether the region is tracked
  if(!(in_low_window||in_high_window)) return;

  // Collect the "contains compiled code" bit of every affected page
  page=first_page;
  while(page<=last_page) {
    has_code|=((cached_code[page>>3]>>(page&7))&1);
    page++;
  }

  if(has_code)
    invalidate_blocks(first_page,last_page);
}
8392 
// Interface table describing the dynamic recompiler core.
// The initializer is positional, so the entries must stay in the order in
// which SH2Interface_struct declares its members.
SH2Interface_struct SH2Dynarec = {
   // Core identifier and display name
   SH2CORE_DYNAREC,
   "SH2 Dynamic Recompiler",

   // Init / deinit / reset / exec entry points
   SH2DynarecInit,
   SH2DynarecDeInit,
   SH2DynarecReset,
   SH2DynarecExec,

   // Register read accessors
   SH2DynarecGetRegisters,
   SH2DynarecGetGPR,
   SH2DynarecGetSR,
   SH2DynarecGetGBR,
   SH2DynarecGetVBR,
   SH2DynarecGetMACH,
   SH2DynarecGetMACL,
   SH2DynarecGetPR,
   SH2DynarecGetPC,

   // Register write accessors
   SH2DynarecSetRegisters,
   SH2DynarecSetGPR,
   SH2DynarecSetSR,
   SH2DynarecSetGBR,
   SH2DynarecSetVBR,
   SH2DynarecSetMACH,
   SH2DynarecSetMACL,
   SH2DynarecSetPR,
   SH2DynarecSetPC,

   // Interrupt list handling uses the SH2Interpreter* routines declared above
   SH2InterpreterSendInterrupt,
   SH2InterpreterGetInterrupts,
   SH2InterpreterSetInterrupts,

   // Memory-write notification hook
   SH2DynarecWriteNotify
};
8428 
// Addresses of individual yabsys timing fields and copies of the
// YABSYS_TIMING_* constants, published as plain global symbols.
// NOTE(review): presumably these exist so that code which cannot see the C
// struct layout or preprocessor constants (e.g. the assembly linkage) can
// reach them - confirm against the users of these symbols.
u32 * decilinestop_p = &yabsys.DecilineStop;
u32 * decilineusec_p = &yabsys.DecilineUsec;
u32 * SH2CycleFrac_p = &yabsys.SH2CycleFrac;
u32 * UsecFrac_p = &yabsys.UsecFrac;
//u32 decilinecycles = yabsys.DecilineStop >> YABSYS_TIMING_BITS;
u32 yabsys_timing_bits = YABSYS_TIMING_BITS;
u32 yabsys_timing_mask = YABSYS_TIMING_MASK;
// Addresses of the yabsys line counters
int * linecount_p = &yabsys.LineCount;
int * vblanklinecount_p = &yabsys.VBlankLineCount;
int * maxlinecount_p = &yabsys.MaxLineCount;

// Byte offset of NumberOfInterrupts inside SH2_struct, stored as a pointer
// value: it is the member's address computed relative to a null base.
void * NumberOfInterruptsOffset = &((SH2_struct *)0)->NumberOfInterrupts;
8441