/* $Id: rc-x86-rws.c,v 1.5 2010/06/05 19:14:40 fredette Exp $ */

/* libtme/host/x86/rc-x86-rws.c - x86 host recode reads and writes support: */

/*
 * Copyright (c) 2008 Matt Fredette
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Matt Fredette.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

_TME_RCSID("$Id: rc-x86-rws.c,v 1.5 2010/06/05 19:14:40 fredette Exp $");

/* this emits instructions to do a byte swap in a host register: */
static tme_uint8_t *
_tme_recode_x86_rw_bswap(tme_uint8_t *thunk_bytes,
			 unsigned int size,
			 unsigned int reg_x86)
{
  unsigned int rex;

  /* NB: we don't have to worry about zero-truncation on an x86-64
     host; if this register is supposed to be sign-extended, we do
     that after all byte swapping: */

  /* if this is an eight-bit byte swap: */
  if (size == TME_RECODE_SIZE_8) {
    /* nothing to do */
  }

  /* otherwise, if this is a 16-bit byte swap: */
  else if (size == TME_RECODE_SIZE_16) {

    /* if this register has a high 8-bit encoding: */
    if (reg_x86 < TME_RECODE_X86_REG_SP) {

      /* emit an xchgb %regh, %regl: */
      thunk_bytes[0]
	= (TME_RECODE_X86_OPCODE_BINOP_XCHG
	   + TME_RECODE_X86_OPCODE_BINOP_Gb_Eb);
      thunk_bytes[1]
	= TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_REG(reg_x86),
				      TME_RECODE_X86_REG((TME_RECODE_X86_REG_SP + reg_x86)));
      thunk_bytes += 2;
    }

    /* otherwise, this register doesn't have a high 8-bit encoding: */
    else {

      /* emit a rorw $8, %reg: */
      thunk_bytes[0] = TME_RECODE_X86_PREFIX_OPSIZ;
      thunk_bytes[1] = TME_RECODE_X86_OPCODE_GRP2_Ib_Ev;
      thunk_bytes[2]
	= TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_REG(reg_x86),
				      TME_RECODE_X86_OPCODE_GRP2_ROR);
      thunk_bytes[3] = 8;
      thunk_bytes += 4;
    }
  }

  /* otherwise, this is a 32-bit swap, and/or a host-sized byte
     swap: */
  else {

    /* if the bswap instruction is available: */
    if (1) {

      /* emit a bswap %reg: */
      rex = TME_RECODE_X86_REX_R(size, reg_x86);
      if (rex != 0) {
	*(thunk_bytes++) = rex;
      }
      thunk_bytes[0] = TME_RECODE_X86_OPCODE_ESC_0F;
      thunk_bytes[1] = TME_RECODE_X86_OPCODE0F_BSWAP(reg_x86);
      thunk_bytes += 2;
    }

    /* otherwise, the bswap instruction is not available: */
    else {

      /* this must be a 32-bit swap: */
      assert (size == TME_RECODE_SIZE_32);

      /* emit:
	 rorw $8, %reg
	 rorl $16, %reg
	 rorw $8, %reg
      */
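      /* for illustration (added, not from the original source): with
	 the 32-bit value 0xAABBCCDD in %reg, the first rorw $8 leaves
	 0xAABBDDCC, the rorl $16 leaves 0xDDCCAABB, and the final
	 rorw $8 leaves 0xDDCCBBAA, the fully byte-swapped value: */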
      thunk_bytes[0] = TME_RECODE_X86_PREFIX_OPSIZ;
      thunk_bytes[1] = TME_RECODE_X86_OPCODE_GRP2_Ib_Ev;
      thunk_bytes[2]
	= TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_REG(reg_x86),
				      TME_RECODE_X86_OPCODE_GRP2_ROR);
      thunk_bytes[3] = 8;
      thunk_bytes += 4;
      thunk_bytes[0] = TME_RECODE_X86_OPCODE_GRP2_Ib_Ev;
      thunk_bytes[1]
	= TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_REG(reg_x86),
				      TME_RECODE_X86_OPCODE_GRP2_ROR);
      thunk_bytes[2] = 16;
      thunk_bytes += 3;
      thunk_bytes[0] = TME_RECODE_X86_PREFIX_OPSIZ;
      thunk_bytes[1] = TME_RECODE_X86_OPCODE_GRP2_Ib_Ev;
      thunk_bytes[2]
	= TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_REG(reg_x86),
				      TME_RECODE_X86_OPCODE_GRP2_ROR);
      thunk_bytes[3] = 8;
      thunk_bytes += 4;
    }
  }

  return (thunk_bytes);
}

/* this host function returns a new read/write thunk: */
struct tme_recode_rw_thunk *
tme_recode_host_rw_thunk_new(struct tme_recode_ic *ic,
			     const struct tme_recode_rw *rw)
{
  struct tme_recode_rw_thunk *rw_thunk;
  unsigned int max_boundaries_guest;
  unsigned int max_boundaries_host;
  struct tme_recode_x86_tlb_type x86_tlb_type;
  unsigned int reg_x86_address;
  unsigned int reg_host_value_0;
  unsigned int reg_host_value_1;
  struct tme_recode_insn insn_buffer;
  tme_uint8_t *thunk_bytes;
  unsigned int rex;
  unsigned int reg_host_value_orig;
  int stack_adjust;
  unsigned int reg_size;
  int memory_signed;

  /* start the new read/write thunk: */
  if (!tme_recode_host_thunk_start(ic)) {
    abort();
  }
  rw_thunk = tme_new(struct tme_recode_rw_thunk, 1);
  rw_thunk->tme_recode_x86_rw_thunk_subs
    = tme_recode_build_to_thunk_off(ic, ic->tme_recode_ic_thunk_build_next);

  /* assume that we will always need to assist: */
  x86_tlb_type.tme_recode_x86_tlb_type_assist_jmp_address_ok = (tme_uint8_t *) NULL;
  x86_tlb_type.tme_recode_x86_tlb_type_assist_jmp = (tme_uint8_t *) NULL;

  /* get the worst-case maximum number of guest bus boundaries this
     read/write could cross: */
  max_boundaries_guest
    = (rw->tme_recode_rw_bus_boundary == 0
       ? 0
       : (((TME_MAX(rw->tme_recode_rw_bus_boundary,
		    rw->tme_recode_rw_address_type.tme_recode_address_type_align_min)
	    - rw->tme_recode_rw_address_type.tme_recode_address_type_align_min)
	   + (TME_BIT(rw->tme_recode_rw_memory_size - TME_RECODE_SIZE_8)
	      - 1))
	  / rw->tme_recode_rw_bus_boundary));
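  /* worked example (added, not from the original source): with an
     8-byte guest access, a minimum guest alignment of one byte, and
     a four-byte guest bus boundary, this is
     ((TME_MAX(4, 1) - 1) + (8 - 1)) / 4 == 2; for example, an access
     starting at offset 3 spans bytes 3..10 and crosses the
     boundaries at 4 and 8: */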

  /* get the worst-case maximum number of host bus boundaries this
     read/write could cross: */
  max_boundaries_host
    = (((TME_MAX(TME_MEMORY_BUS_BOUNDARY,
		 rw->tme_recode_rw_address_type.tme_recode_address_type_align_min)
	 - rw->tme_recode_rw_address_type.tme_recode_address_type_align_min)
	+ (TME_BIT(rw->tme_recode_rw_memory_size - TME_RECODE_SIZE_8)
	   - 1))
       / TME_MEMORY_BUS_BOUNDARY);

  /* NB: as of 20080906, Intel's "Intel 64 Architecture Memory
     Ordering White Paper" (Order number 318147-001, from August 2007)
     only guarantees that size-aligned reads and writes up to 64 bits
     are atomic; we assume that non-Intel processors are similar.
     until this guarantee is extended to cover size-aligned 128-bit
     reads and writes, TME_MEMORY_BUS_BOUNDARY shouldn't be more than
     sizeof(tme_uint64_t).

     this means that we will always assist guest 128-bit reads
     and writes that might be atomic for the guest (since we will
     detect above that we might cross more bus boundaries than the
     guest might): */

  /* if threads are cooperative, or if common atomic operations aren't
     being done under software lock and we can't cross more boundaries
     than the guest would: */
  if (TME_THREADS_COOPERATIVE
      || (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0
	  && max_boundaries_host <= max_boundaries_guest)) {

    /* get the TLB type for the address type: */
    tme_recode_address_type_tlb_type(ic,
				     &rw->tme_recode_rw_address_type,
				     &x86_tlb_type.tme_recode_tlb_type);

    /* XXX FIXME - document read/write thunk calling convention, how
       it differs from normal subs */

    /* for a double-host-size guest, the guest address is in the a:bp
       register pair, otherwise it's in the a register.  NB that we
       primarily deal with only a host-sized part: */
    reg_x86_address
      = tme_recode_x86_reg_from_host[_tme_recode_x86_tlb_reg_host_address(ic)];

    /* for a write, the value to write is in the first host register
       (pair).  for a read, the value read is returned in the same
       host register (pair) that was used for the address: */
    if (rw->tme_recode_rw_write) {
      reg_host_value_0 = TME_RECODE_REG_HOST(0);
    }
    else {
      reg_host_value_0
	= _tme_recode_x86_tlb_reg_host_address(ic);
      assert (tme_recode_x86_reg_from_host[reg_host_value_0] == reg_x86_address);
    }
    reg_host_value_1 = reg_host_value_0 + 1;

    /* find, busy, and check a data TLB entry: */
    _tme_recode_x86_tlb_busy(ic,
			     &rw->tme_recode_rw_address_type,
			     &x86_tlb_type);

    /* start more instructions: */
    tme_recode_x86_insns_start(ic, thunk_bytes);

    /* if this is a write, and we need to byte-swap the value to
       write: */
    if (rw->tme_recode_rw_write
	&& rw->tme_recode_rw_memory_size > TME_RECODE_SIZE_8
	&& rw->tme_recode_rw_memory_endian != TME_ENDIAN_NATIVE) {

      /* we will byte-swap the value to write into (at least) the
	 TLB scratch register: */
      reg_host_value_orig = reg_host_value_0;
      assert (reg_host_value_1 == reg_host_value_0 + 1);
      reg_host_value_0 = TME_RECODE_REG_HOST_UNDEF;
      assert (tme_recode_x86_reg_from_host[reg_host_value_0] == TME_RECODE_X86_REG_TLB_SCRATCH);

      /* if this is a double-host-size write: */
      if (TME_RECODE_SIZE_IS_DOUBLE_HOST(rw->tme_recode_rw_memory_size)) {

	/* we will use the most-significant half of the guest register
	   that held the address for the higher-in-memory (i.e., the
	   guest least-significant) part of the value.  NB that we are
	   swapping guest register halves here, too: */
	reg_host_value_1 = TME_RECODE_X86_REG_HOST_SUBS_SRC1 + 1;
	assert (tme_recode_x86_reg_from_host[reg_host_value_1 - 1] == reg_x86_address);
	_tme_recode_x86_emit_reg_copy(thunk_bytes,
				      tme_recode_x86_reg_from_host[reg_host_value_orig + 0],
				      tme_recode_x86_reg_from_host[reg_host_value_1]);
	thunk_bytes
	  = _tme_recode_x86_rw_bswap(thunk_bytes,
				     TME_RECODE_SIZE_HOST,
				     tme_recode_x86_reg_from_host[reg_host_value_1]);
	reg_host_value_orig += 1;
      }

      /* copy and swap the lower-in-memory (i.e., the guest
	 most-significant) part of the value: */
      _tme_recode_x86_emit_reg_copy(thunk_bytes,
				    tme_recode_x86_reg_from_host[reg_host_value_orig],
				    tme_recode_x86_reg_from_host[reg_host_value_0]);
      thunk_bytes
	= _tme_recode_x86_rw_bswap(thunk_bytes,
				   TME_MIN(rw->tme_recode_rw_memory_size,
					   TME_RECODE_SIZE_HOST),
				   tme_recode_x86_reg_from_host[reg_host_value_0]);
    }

    /* if threads aren't cooperative, and this is a double-host-size
       access: */
    if (!TME_THREADS_COOPERATIVE
	&& TME_RECODE_SIZE_IS_DOUBLE_HOST(rw->tme_recode_rw_memory_size)) {

      /* NB: in this case, we assume that the host is capable of SSE2
	 instructions.  this seems reasonable: */

      /* the x86-64 ABI requires that the stack pointer be 16-byte
	 aligned immediately before a call instruction.  inside an
	 insn thunk, the stack pointer is 16-byte aligned immediately
	 before a call to a read/write thunk, which means at the
	 beginning of a read/write thunk it is only 8-byte aligned
	 (because of the return address for the read/write thunk).

	 on x86-64, we want to use at least one movdqa, which requires
	 us to align the stack pointer: */
      stack_adjust
	= (TME_RECODE_SIZE_HOST == TME_RECODE_SIZE_32
	   ? 0
	   : TME_BIT(TME_RECODE_SIZE_HOST - TME_RECODE_SIZE_8));

      /* if this is a write: */
      if (rw->tme_recode_rw_write) {

	/* do any stack pointer alignment: */
	if (stack_adjust) {
	  thunk_bytes = _tme_recode_x86_emit_adjust_sp(thunk_bytes, -stack_adjust);
	}

	/* push the double-host-size value to write: */
	_tme_recode_x86_emit_reg_push(thunk_bytes, tme_recode_x86_reg_from_host[reg_host_value_1]);
	_tme_recode_x86_emit_reg_push(thunk_bytes, tme_recode_x86_reg_from_host[reg_host_value_0]);

	/* after the write, we will need to discard the value: */
	stack_adjust += TME_BIT(rw->tme_recode_rw_memory_size - TME_RECODE_SIZE_8);
      }

      /* otherwise, this is a read: */
      else {

	/* do any stack pointer alignment, and make space for the
	   double-host-size value to read: */
	thunk_bytes
	  = _tme_recode_x86_emit_adjust_sp(thunk_bytes,
					   -(stack_adjust
					     + TME_BIT(rw->tme_recode_rw_memory_size
						       - TME_RECODE_SIZE_8)));
      }

      /* emit one of:
	 movq (%esp), %xmm0
	 movdqa (%esp), %xmm0
	 movq (%address), %xmm0
	 movdqu (%address), %xmm0
	 movdqa (%address), %xmm0

	 to read the value to read or write into %xmm0:
      */
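      /* NB (added for clarity; this restates the selection below):
	 on an ia32 host this is always a movq.  on an x86-64 host, a
	 movdqu is used for a read whose guest alignment may be less
	 than the full access size, and a movdqa is used otherwise,
	 i.e. for a write, where the source is the 16-byte-aligned
	 stack, or for a read that the guest guarantees is aligned.
	 the store below mirrors this selection: */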
      thunk_bytes[0]
	= ((TME_RECODE_SIZE_HOST == TME_RECODE_SIZE_32
	    || (!rw->tme_recode_rw_write
		&& (rw->tme_recode_rw_address_type.tme_recode_address_type_align_min
		    < TME_BIT(rw->tme_recode_rw_memory_size - TME_RECODE_SIZE_8))))
	   ? TME_RECODE_X86_PREFIX_REP
	   : TME_RECODE_X86_PREFIX_OPSIZ);
      thunk_bytes[1] = TME_RECODE_X86_OPCODE_ESC_0F;
      thunk_bytes[2]
	= (TME_RECODE_SIZE_HOST == TME_RECODE_SIZE_32
	   ? TME_RECODE_X86_OPCODEF30F_MOVQ_Wq_Vq
	   : thunk_bytes[0] == TME_RECODE_X86_PREFIX_REP
	   ? TME_RECODE_X86_OPCODEF30F_MOVDQU_Wdq_Vdq
	   : TME_RECODE_X86_OPCODE660F_MOVDQA_Wdq_Vdq);
      if (rw->tme_recode_rw_write) {
	thunk_bytes[3]
	  = TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_EA(TME_RECODE_X86_EA_BASE_SIB),
					TME_RECODE_X86_REG(TME_RECODE_X86_REG_XMM(0)));
	thunk_bytes[4]
	  = TME_RECODE_X86_SIB(TME_RECODE_X86_REG_SP, TME_RECODE_X86_SIB_INDEX_NONE, 1);
	thunk_bytes += 5;
      }
      else {
	thunk_bytes[3]
	  = TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_EA(reg_x86_address),
					TME_RECODE_X86_REG(TME_RECODE_X86_REG_XMM(0)));
	thunk_bytes += 4;
      }

      /* emit one of:
	 movq %xmm0, (%address)
	 movdqu %xmm0, (%address)
	 movdqa %xmm0, (%address)
	 movq %xmm0, (%esp)
	 movdqa %xmm0, (%esp)

	 to write the value to read or write in %xmm0:
      */
      thunk_bytes[0]
	= ((TME_RECODE_SIZE_HOST == TME_RECODE_SIZE_32
	    || (rw->tme_recode_rw_write
		&& (rw->tme_recode_rw_address_type.tme_recode_address_type_align_min
		    >= TME_BIT(rw->tme_recode_rw_memory_size - TME_RECODE_SIZE_8))))
	   ? TME_RECODE_X86_PREFIX_OPSIZ
	   : TME_RECODE_X86_PREFIX_REP);
      thunk_bytes[1] = TME_RECODE_X86_OPCODE_ESC_0F;
      thunk_bytes[2]
	= (TME_RECODE_SIZE_HOST == TME_RECODE_SIZE_32
	   ? TME_RECODE_X86_OPCODE660F_MOVQ_Vq_Wq
	   : thunk_bytes[0] == TME_RECODE_X86_PREFIX_REP
	   ? TME_RECODE_X86_OPCODEF30F_MOVDQU_Vdq_Wdq
	   : TME_RECODE_X86_OPCODE660F_MOVDQA_Vdq_Wdq);
      if (rw->tme_recode_rw_write) {
	thunk_bytes[3]
	  = TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_EA(reg_x86_address),
					TME_RECODE_X86_REG(TME_RECODE_X86_REG_XMM(0)));
	thunk_bytes += 4;
      }
      else {
	thunk_bytes[3]
	  = TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_EA(TME_RECODE_X86_EA_BASE_SIB),
					TME_RECODE_X86_REG(TME_RECODE_X86_REG_XMM(0)));
	thunk_bytes[4]
	  = TME_RECODE_X86_SIB(TME_RECODE_X86_REG_SP, TME_RECODE_X86_SIB_INDEX_NONE, 1);
	thunk_bytes += 5;
      }

      /* if this was a read: */
      if (!rw->tme_recode_rw_write) {

	/* pop the double-host-size value we read: */
	_tme_recode_x86_emit_reg_pop(thunk_bytes, tme_recode_x86_reg_from_host[reg_host_value_0]);
	_tme_recode_x86_emit_reg_pop(thunk_bytes, tme_recode_x86_reg_from_host[reg_host_value_1]);
      }

      /* discard any double-host-size value we wrote, and any stack
	 pointer alignment: */
      if (stack_adjust) {
	thunk_bytes = _tme_recode_x86_emit_adjust_sp(thunk_bytes, -stack_adjust);
      }
    }

    /* otherwise, either threads are cooperative, or this isn't a
       double-host-size access: */

    /* if this is a write: */
    else if (rw->tme_recode_rw_write) {

      /* if this is an ia32 host, and an 8-bit store of a register
	 that doesn't have an 8-bit encoding: */
      if (TME_RECODE_SIZE_HOST <= TME_RECODE_SIZE_32
	  && rw->tme_recode_rw_memory_size == TME_RECODE_SIZE_8
	  && tme_recode_x86_reg_from_host[reg_host_value_0] >= TME_RECODE_X86_REG_SP) {

	/* we will copy the value to write into the TLB scratch
	   register, which has an 8-bit encoding: */
	reg_host_value_orig = reg_host_value_0;
	reg_host_value_0 = TME_RECODE_REG_HOST_UNDEF;
	assert (tme_recode_x86_reg_from_host[reg_host_value_0] < TME_RECODE_X86_REG_SP);
	assert (tme_recode_x86_reg_from_host[reg_host_value_0] == TME_RECODE_X86_REG_TLB_SCRATCH);
	_tme_recode_x86_emit_reg_copy(thunk_bytes,
				      tme_recode_x86_reg_from_host[reg_host_value_orig],
				      tme_recode_x86_reg_from_host[reg_host_value_0]);
      }

      /* emit one of:
	 movb %reg, (%address)
	 movw %reg, (%address)
	 movl %reg, (%address)
	 movq %reg, (%address)
      */
      if (rw->tme_recode_rw_memory_size == TME_RECODE_SIZE_16) {
	*(thunk_bytes++) = TME_RECODE_X86_PREFIX_OPSIZ;
      }
      rex
	= (TME_RECODE_X86_REX_R(TME_MIN(rw->tme_recode_rw_memory_size,
					TME_RECODE_SIZE_HOST),
				tme_recode_x86_reg_from_host[reg_host_value_0])
	   | TME_RECODE_X86_REX_B(0, reg_x86_address));
      if (rex != 0) {
	*(thunk_bytes++) = rex;
      }
      thunk_bytes[0]
	= (rw->tme_recode_rw_memory_size >= TME_RECODE_SIZE_16
	   ? (TME_RECODE_X86_OPCODE_BINOP_MOV + TME_RECODE_X86_OPCODE_BINOP_Gv_Ev)
	   : (TME_RECODE_X86_OPCODE_BINOP_MOV + TME_RECODE_X86_OPCODE_BINOP_Gb_Eb));
      /* NB: a disp8 EA must be used when the base register is bp or r13: */
      if (TME_RECODE_X86_REG(reg_x86_address) == TME_RECODE_X86_REG_BP) {
	thunk_bytes[1]
	  = TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_EA_DISP8(reg_x86_address),
					TME_RECODE_X86_REG(tme_recode_x86_reg_from_host[reg_host_value_0]));
	thunk_bytes[2] = 0;
	thunk_bytes += 3;
      }
      else {
	thunk_bytes[1]
	  = TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_EA(reg_x86_address),
					TME_RECODE_X86_REG(tme_recode_x86_reg_from_host[reg_host_value_0]));
	thunk_bytes += 2;
      }

      /* if this is a double-host-size write: */
      if (TME_RECODE_SIZE_IS_DOUBLE_HOST(rw->tme_recode_rw_memory_size)) {

	/* emit one of:
	   movl %reg, 4(%address)
	   movq %reg, 8(%address)
	*/
	rex
	  = (TME_RECODE_X86_REX_R(TME_RECODE_SIZE_HOST,
				  tme_recode_x86_reg_from_host[reg_host_value_1])
	   | TME_RECODE_X86_REX_B(0, reg_x86_address));
	if (rex != 0) {
	  *(thunk_bytes++) = rex;
	}
	thunk_bytes[0] = TME_RECODE_X86_OPCODE_BINOP_MOV + TME_RECODE_X86_OPCODE_BINOP_Gv_Ev;
	thunk_bytes[1]
	  = TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_EA_DISP8(reg_x86_address),
					TME_RECODE_X86_REG(tme_recode_x86_reg_from_host[reg_host_value_1]));
	thunk_bytes[2] = TME_BIT(TME_RECODE_SIZE_HOST - TME_RECODE_SIZE_8);
	thunk_bytes += 3;
      }
    }

    /* otherwise, this is a read: */
    else {

      /* if this is a double-host-size read: */
      if (TME_RECODE_SIZE_IS_DOUBLE_HOST(rw->tme_recode_rw_memory_size)) {

	/* emit one of:
	   movl 4(%address), %reg
	   movq 8(%address), %reg
	*/
	rex
	  = (TME_RECODE_X86_REX_R(TME_RECODE_SIZE_HOST,
				  tme_recode_x86_reg_from_host[reg_host_value_1])
	     | TME_RECODE_X86_REX_B(0, reg_x86_address));
	if (rex != 0) {
	  *(thunk_bytes++) = rex;
	}
	thunk_bytes[0] = TME_RECODE_X86_OPCODE_BINOP_MOV + TME_RECODE_X86_OPCODE_BINOP_Ev_Gv;
	thunk_bytes[1]
	  = TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_EA_DISP8(reg_x86_address),
					TME_RECODE_X86_REG(tme_recode_x86_reg_from_host[reg_host_value_1]));
	thunk_bytes[2] = TME_BIT(TME_RECODE_SIZE_HOST - TME_RECODE_SIZE_8);
	thunk_bytes += 3;
      }

      /* assume that we will read into all of a host register: */
      reg_size = TME_MAX(rw->tme_recode_rw_reg_size, TME_RECODE_SIZE_HOST);

      /* by default, in the read instruction itself we zero-extend the
	 value into all of a host register.  the only time we will
	 sign-extend in the read instruction itself is when the
	 register size is greater than the memory size, memory is
	 signed, and we're only reading a byte or the guest's byte
	 order matches the host.  if the first two are true, but we're
	 reading more than a byte and the guest's byte order doesn't
	 match the host, we have to wait to do the sign extension
	 after we've byte swapped the value read: */
      memory_signed
	= (rw->tme_recode_rw_reg_size > rw->tme_recode_rw_memory_size
	   && rw->tme_recode_rw_memory_signed
	   && (rw->tme_recode_rw_memory_size == TME_RECODE_SIZE_8
	       || rw->tme_recode_rw_memory_endian == TME_ENDIAN_NATIVE));

      /* if this is an x86-64 host and a 32-bit sign- or zero-extended load: */
      if (TME_RECODE_SIZE_HOST > TME_RECODE_SIZE_32
	  && rw->tme_recode_rw_memory_size == TME_RECODE_SIZE_32
	  && rw->tme_recode_rw_reg_size > rw->tme_recode_rw_memory_size) {

	/* if this is a zero-extended load, or if the guest's byte
	   order doesn't match the host: */
	if (!memory_signed) {

	  /* read into only the least-significant 32 bits of the
	     register.  this will zero-extend the read to all 64
	     bits.  this should prevent a rex prefix: */
	  reg_size = TME_RECODE_SIZE_32;
	}
      }

      /* emit any rex prefix: */
      rex
	= (TME_RECODE_X86_REX_B(0, reg_x86_address)
	   | TME_RECODE_X86_REX_R(TME_MIN(reg_size,
					  TME_RECODE_SIZE_HOST),
				  tme_recode_x86_reg_from_host[reg_host_value_0]));
      if (rex != 0) {
	*(thunk_bytes++) = rex;
      }

      /* if this is an x86-64 host and a 32-bit sign-extended load: */
      if (TME_RECODE_SIZE_HOST > TME_RECODE_SIZE_32
	  && rw->tme_recode_rw_memory_size == TME_RECODE_SIZE_32
	  && reg_size > TME_RECODE_SIZE_32) {

	/* emit the opcode part of a movslq (%address), %reg: */
	thunk_bytes[0] = TME_RECODE_X86_OPCODE_MOVS_El_Gv;
      }

      /* otherwise, if this is an 8- or 16-bit load: */
      else if (rw->tme_recode_rw_memory_size <= TME_RECODE_SIZE_16) {

	/* emit the opcode part of one of:
	   movsb (%address), %reg
	   movzb (%address), %reg
	   movsw (%address), %reg
	   movzw (%address), %reg
	*/
	*(thunk_bytes++) = TME_RECODE_X86_OPCODE_ESC_0F;
	thunk_bytes[0]
	  = (rw->tme_recode_rw_memory_size == TME_RECODE_SIZE_8
	     ? (memory_signed
		? TME_RECODE_X86_OPCODE0F_MOVS_Eb_Gv
		: TME_RECODE_X86_OPCODE0F_MOVZ_Eb_Gv)
	     : (memory_signed
		? TME_RECODE_X86_OPCODE0F_MOVS_Ew_Gv
		: TME_RECODE_X86_OPCODE0F_MOVZ_Ew_Gv));
      }

      /* otherwise, this load is double-host-size, or host-size, or a
	 32-bit zero-extended load on an x86-64 host: */
      else {

	/* emit the opcode part of a movl (%address), %reg or a movq (%address), %reg: */
	thunk_bytes[0]
	  = (TME_RECODE_X86_OPCODE_BINOP_MOV
	     + TME_RECODE_X86_OPCODE_BINOP_Ev_Gv);
      }

      /* emit the modR/M byte for this instruction: */
      /* NB: a disp8 EA must be used when the base register is bp or r13: */
      if (TME_RECODE_X86_REG(reg_x86_address) == TME_RECODE_X86_REG_BP) {
	thunk_bytes[1]
	  = TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_EA_DISP8(reg_x86_address),
					TME_RECODE_X86_REG(tme_recode_x86_reg_from_host[reg_host_value_0]));
	thunk_bytes[2] = 0;
	thunk_bytes += 3;
      }
      else {
	thunk_bytes[1]
	  = TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_EA(reg_x86_address),
					TME_RECODE_X86_REG(tme_recode_x86_reg_from_host[reg_host_value_0]));
	thunk_bytes += 2;
      }
    }

    /* if this is a read: */
    if (!rw->tme_recode_rw_write) {

      /* if we need to byte-swap the value read: */
      if (rw->tme_recode_rw_memory_size > TME_RECODE_SIZE_8
	  && rw->tme_recode_rw_memory_endian != TME_ENDIAN_NATIVE) {

	/* if this is a double-host-size read: */
	if (TME_RECODE_SIZE_IS_DOUBLE_HOST(rw->tme_recode_rw_memory_size)) {

	  /* swap the guest register halves: */
	  _tme_recode_x86_emit_reg_binop(thunk_bytes,
					 TME_RECODE_X86_OPCODE_BINOP_XCHG,
					 tme_recode_x86_reg_from_host[reg_host_value_0],
					 tme_recode_x86_reg_from_host[reg_host_value_1]);

	  /* byte-swap the most-significant half of the guest register: */
	  thunk_bytes
	    = _tme_recode_x86_rw_bswap(thunk_bytes,
				       TME_RECODE_SIZE_HOST,
				       tme_recode_x86_reg_from_host[reg_host_value_1]);
	}

	/* byte-swap the (least-significant half of the) guest register: */
	thunk_bytes
	  = _tme_recode_x86_rw_bswap(thunk_bytes,
				     TME_MIN(rw->tme_recode_rw_memory_size,
					     TME_RECODE_SIZE_HOST),
				     tme_recode_x86_reg_from_host[reg_host_value_0]);

	/* if the read is sign-extended from smaller than host-sized: */
	if (rw->tme_recode_rw_reg_size > rw->tme_recode_rw_memory_size
	    && rw->tme_recode_rw_memory_signed
	    && rw->tme_recode_rw_memory_size < TME_RECODE_SIZE_HOST) {

	  /* sign-extend the value read to host size: */
	  tme_recode_x86_insns_finish(ic, thunk_bytes);
	  insn_buffer.tme_recode_insn_opcode = TME_RECODE_OPCODE_EXTS;
	  insn_buffer.tme_recode_insn_operand_src[0] = reg_host_value_0;
	  insn_buffer.tme_recode_insn_operand_src[1] = rw->tme_recode_rw_memory_size;
	  insn_buffer.tme_recode_insn_operand_dst = reg_host_value_0;
	  _tme_recode_x86_insn_ext(ic, &insn_buffer);
	  tme_recode_x86_insns_start(ic, thunk_bytes);
	}
      }

      /* if this is a double-host-size read that needs zero- or sign-extension: */
      if (TME_RECODE_SIZE_IS_DOUBLE_HOST(rw->tme_recode_rw_reg_size)
	  && rw->tme_recode_rw_reg_size > rw->tme_recode_rw_memory_size) {

	/* if memory is signed: */
	if (rw->tme_recode_rw_memory_signed) {

	  /* sign-extend the value read: */
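	  /* NB (added for clarity): the mov copies the
	     least-significant half into the most-significant half's
	     register, the add then doubles that register so that its
	     sign bit shifts out into the carry flag, and the sbb of
	     the register with itself leaves all-ones if the carry
	     (sign) bit was set and zero otherwise: */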
	  _tme_recode_x86_emit_reg_binop(thunk_bytes,
					 TME_RECODE_X86_OPCODE_BINOP_MOV,
					 tme_recode_x86_reg_from_host[reg_host_value_0],
					 tme_recode_x86_reg_from_host[reg_host_value_1]);
	  _tme_recode_x86_emit_reg_binop(thunk_bytes,
					 TME_RECODE_X86_OPCODE_BINOP_ADD,
					 tme_recode_x86_reg_from_host[reg_host_value_0],
					 tme_recode_x86_reg_from_host[reg_host_value_1]);
	  _tme_recode_x86_emit_reg_binop(thunk_bytes,
					 TME_RECODE_X86_OPCODE_BINOP_SBB,
					 tme_recode_x86_reg_from_host[reg_host_value_1],
					 tme_recode_x86_reg_from_host[reg_host_value_1]);
	}

	/* otherwise, memory is unsigned: */
	else {

	  /* zero-extend the value read: */
	  /* NB: we always make this a 32-bit operation, to try to
	     prevent a rex prefix: */
	  rex = (TME_RECODE_X86_REX_R(TME_RECODE_SIZE_32,
				      tme_recode_x86_reg_from_host[reg_host_value_1])
		 | TME_RECODE_X86_REX_B(TME_RECODE_SIZE_32,
					tme_recode_x86_reg_from_host[reg_host_value_1]));
	  if (rex != 0) {
	    *(thunk_bytes++) = rex;
	  }
	  thunk_bytes[0]
	    = (TME_RECODE_X86_OPCODE_BINOP_XOR
	       + TME_RECODE_X86_OPCODE_BINOP_Gv_Ev);
	  thunk_bytes[1]
	    = TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_REG(tme_recode_x86_reg_from_host[reg_host_value_1]),
					  TME_RECODE_X86_REG(tme_recode_x86_reg_from_host[reg_host_value_1]));
	  thunk_bytes += 2;
	}
      }
    }

    /* finish these instructions: */
    tme_recode_x86_insns_finish(ic, thunk_bytes);

    /* unbusy the TLB entry: */
    _tme_recode_x86_tlb_unbusy(ic,
			       x86_tlb_type.tme_recode_tlb_type.tme_recode_tlb_type_offset_token);

    /* start more instructions: */
    tme_recode_x86_insns_start(ic, thunk_bytes);

    /* return to the instructions thunk: */
    *(thunk_bytes++) = TME_RECODE_X86_OPCODE_RET;

    /* finish these instructions: */
    tme_recode_x86_insns_finish(ic, thunk_bytes);
  }

  if (x86_tlb_type.tme_recode_x86_tlb_type_assist_jmp != NULL) {

    /* finish the assist conditional jump above, now
       that we are at the target: */
    _tme_recode_x86_fixup_jmp(x86_tlb_type.tme_recode_x86_tlb_type_assist_jmp,
			      ic->tme_recode_ic_thunk_build_next);

    /* start more instructions: */
    tme_recode_x86_insns_start(ic, thunk_bytes);

    /* exclusive-or the TLB entry page offset with the
       (least-significant half of the) TLB entry page, to convert the
       TLB entry page offset back into the (least-significant half of
       the) guest address: */
    thunk_bytes
      = _tme_recode_x86_tlb_ref(thunk_bytes,
				TME_RECODE_SIZE_HOST,
				(TME_RECODE_X86_OPCODE_BINOP_XOR
				 + TME_RECODE_X86_OPCODE_BINOP_Ev_Gv),
				TME_RECODE_X86_REG_TLB,
				x86_tlb_type.tme_recode_tlb_type.tme_recode_tlb_type_offset_page,
				TME_RECODE_X86_REG(reg_x86_address));

    /* finish these instructions: */
    tme_recode_x86_insns_finish(ic, thunk_bytes);
  }

  /* fix up any double-host-size address assist: */
  if (x86_tlb_type.tme_recode_x86_tlb_type_assist_jmp_address_ok != NULL) {
    _tme_recode_x86_fixup_jmp(x86_tlb_type.tme_recode_x86_tlb_type_assist_jmp_address_ok,
			      ic->tme_recode_ic_thunk_build_next);
  }

  if (x86_tlb_type.tme_recode_x86_tlb_type_assist_jmp_address_ok != NULL
      || x86_tlb_type.tme_recode_x86_tlb_type_assist_jmp != NULL) {

    /* unbusy the TLB entry: */
    _tme_recode_x86_tlb_unbusy(ic,
			       x86_tlb_type.tme_recode_tlb_type.tme_recode_tlb_type_offset_token);
  }

  /* start more instructions: */
  tme_recode_x86_insns_start(ic, thunk_bytes);

  /* if this is an ia32 host: */
  if (TME_RECODE_SIZE_HOST == TME_RECODE_SIZE_32) {

    /* if this is a write: */
    if (rw->tme_recode_rw_write) {

      /* push the value argument for the guest function.  NB that if
	 double-host-size guests are supported, but this isn't a
	 double-host-size guest, we use a garbage word on the stack as
	 the most-significant half of this argument (which is okay
	 since the guest functions are supposed to truncate their
	 arguments to the expected size): */
      if (TME_RECODE_SIZE_GUEST_MAX > TME_RECODE_SIZE_HOST) {
	_tme_recode_x86_emit_reg_push(thunk_bytes, tme_recode_x86_reg_from_host[TME_RECODE_REG_HOST(0) + 1]);
      }
      _tme_recode_x86_emit_reg_push(thunk_bytes, tme_recode_x86_reg_from_host[TME_RECODE_REG_HOST(0)]);
    }

    /* push the address argument for the guest function.  NB that if
       double-host-size guests are supported, but this isn't a
       double-host-size guest, we use a garbage word on the stack as
       the most-significant half of this argument (which is okay since
       the guest functions are supposed to truncate their arguments to
       the expected size): */
    if (TME_RECODE_SIZE_GUEST_MAX > TME_RECODE_SIZE_HOST) {
      _tme_recode_x86_emit_reg_push(thunk_bytes, TME_RECODE_X86_REG_A);
    }
    _tme_recode_x86_emit_reg_push(thunk_bytes, reg_x86_address);

    /* emit the instruction to push the struct tme_ic * argument for
       the guest function, and then the call instruction to the guest
       function: */
    *((tme_uint16_t *) thunk_bytes)
      = (TME_RECODE_X86_OPCODE_PUSH_Gv(TME_RECODE_X86_REG_IC)
	 + (TME_RECODE_X86_OPCODE_CALL_RELz << 8));
    thunk_bytes += 2 + sizeof(tme_uint32_t);
    ((tme_int32_t *) thunk_bytes)[-1]
      = (tme_recode_function_to_thunk_off(ic, rw->tme_recode_rw_guest_func_read)
	 - tme_recode_build_to_thunk_off(ic, thunk_bytes));

    /* remove the guest function arguments from the stack: */
    thunk_bytes
      = _tme_recode_x86_emit_adjust_sp(thunk_bytes,
				       (sizeof(struct tme_ic *)
					+ sizeof(tme_recode_uguest_t)
					+ (sizeof(tme_recode_uguest_t)
					   * !!rw->tme_recode_rw_write)));
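    /* worked example (added, not from the original source): assuming
       double-host-size guests are supported, so that
       tme_recode_uguest_t is twice the host size, an ia32 host pops
       4 + 8 == 12 bytes here after a read and 4 + 8 + 8 == 20 bytes
       after a write: */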
  }

  /* otherwise, this is an x86-64 host: */
  else {

    /* push the caller-saved registers that aren't normally destroyed
       by a read/write thunk: */
    _tme_recode_x86_emit_reg_push(thunk_bytes, TME_RECODE_X86_REG_N(10));
    _tme_recode_x86_emit_reg_push(thunk_bytes, TME_RECODE_X86_REG_N(11));
    _tme_recode_x86_emit_reg_push(thunk_bytes, TME_RECODE_X86_REG_DI);
    _tme_recode_x86_emit_reg_push(thunk_bytes, TME_RECODE_X86_REG_SI);
    _tme_recode_x86_emit_reg_push(thunk_bytes, TME_RECODE_X86_REG_N(8));
    _tme_recode_x86_emit_reg_push(thunk_bytes, TME_RECODE_X86_REG_N(9));

    /* make the struct tme_ic * argument for the guest function: */
    _tme_recode_x86_emit_reg_copy(thunk_bytes, TME_RECODE_X86_REG_IC, TME_RECODE_X86_REG_DI);

    /* make the address argument for the guest function.  NB that if
       double-host-size guests are supported, but this isn't a
       double-host-size guest, we use a garbage word as the
       most-significant half of this argument (which is okay since the
       guest functions are supposed to truncate their arguments to the
       expected size): */
    _tme_recode_x86_emit_reg_copy(thunk_bytes, reg_x86_address, TME_RECODE_X86_REG_SI);
    if (TME_RECODE_SIZE_IS_DOUBLE_HOST(ic->tme_recode_ic_reg_size)) {
      _tme_recode_x86_emit_reg_copy(thunk_bytes, TME_RECODE_X86_REG_A, TME_RECODE_X86_REG_D);
    }

    /* if this is a write: */
    if (rw->tme_recode_rw_write) {

      /* make the value argument for the guest function.  NB that if
	 double-host-size guests are supported, but this isn't a
	 double-host-size guest, we use a garbage word as the
	 most-significant half of this argument (which is okay since
	 the guest functions are supposed to truncate their arguments
	 to the expected size): */
      _tme_recode_x86_emit_reg_copy(thunk_bytes,
				    tme_recode_x86_reg_from_host[TME_RECODE_REG_HOST(0)],
				    (TME_RECODE_SIZE_GUEST_MAX <= TME_RECODE_SIZE_HOST
				     ? TME_RECODE_X86_REG_D
				     : TME_RECODE_X86_REG_C));
      if (TME_RECODE_SIZE_IS_DOUBLE_HOST(ic->tme_recode_ic_reg_size)) {
	_tme_recode_x86_emit_reg_copy(thunk_bytes,
				      tme_recode_x86_reg_from_host[TME_RECODE_REG_HOST(0)],
				      TME_RECODE_X86_REG_N(8));
      }
    }

    /* we must assume that we can't reach the guest function from the
       instruction thunk with a 32-bit displacement, so emit a call
       to the guest function through %rax: */
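    /* NB (added for clarity): the bytes emitted below are
       effectively a movabs $guest_func, %rax followed by a
       call *%rax: */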
    *((tme_uint16_t *) thunk_bytes)
      = (TME_RECODE_X86_REX_B(TME_RECODE_SIZE_HOST, TME_RECODE_X86_REG_A)
	 + (TME_RECODE_X86_OPCODE_MOV_Iv_Gv(TME_RECODE_X86_REG_A)
	    << 8));
    memcpy(thunk_bytes + 2,
	   &rw->tme_recode_rw_guest_func_write,
	   TME_BIT(TME_RECODE_SIZE_HOST - TME_RECODE_SIZE_8));
    thunk_bytes += 2 + TME_BIT(TME_RECODE_SIZE_HOST - TME_RECODE_SIZE_8);
    *((tme_uint16_t *) thunk_bytes)
      = (TME_RECODE_X86_OPCODE_GRP5
	 + (TME_RECODE_X86_MOD_OPREG_RM(TME_RECODE_X86_MOD_RM_REG(TME_RECODE_X86_REG_A),
					TME_RECODE_X86_OPCODE_GRP5_CALL)
	    << 8));
    thunk_bytes += 2;

    /* pop the caller-saved registers that aren't normally destroyed
       by a read/write thunk: */
    _tme_recode_x86_emit_reg_pop(thunk_bytes, TME_RECODE_X86_REG_N(9));
    _tme_recode_x86_emit_reg_pop(thunk_bytes, TME_RECODE_X86_REG_N(8));
    _tme_recode_x86_emit_reg_pop(thunk_bytes, TME_RECODE_X86_REG_SI);
    _tme_recode_x86_emit_reg_pop(thunk_bytes, TME_RECODE_X86_REG_DI);
    _tme_recode_x86_emit_reg_pop(thunk_bytes, TME_RECODE_X86_REG_N(11));
    _tme_recode_x86_emit_reg_pop(thunk_bytes, TME_RECODE_X86_REG_N(10));
  }

  /* if this is a read: */
  if (!rw->tme_recode_rw_write) {

    /* if this is a double-host-size guest: */
    if (TME_RECODE_SIZE_IS_DOUBLE_HOST(ic->tme_recode_ic_reg_size)) {

      /* move the value read into the expected return registers: */
      _tme_recode_x86_emit_reg_copy(thunk_bytes,
				    TME_RECODE_X86_REG_A,
				    tme_recode_x86_reg_from_host[reg_host_value_0]);
      _tme_recode_x86_emit_reg_copy(thunk_bytes,
				    TME_RECODE_X86_REG_D,
				    tme_recode_x86_reg_from_host[reg_host_value_1]);
    }

    /* otherwise, this is not a double-host-size guest: */
    else {

      /* the value read should already be in the expected register: */
      assert (tme_recode_x86_reg_from_host[reg_host_value_0] == TME_RECODE_X86_REG_A);
    }
  }

  /* return to the instructions thunk: */
  *(thunk_bytes++) = TME_RECODE_X86_OPCODE_RET;

  /* finish these instructions: */
  tme_recode_x86_insns_finish(ic, thunk_bytes);

  /* finish this read/write thunk: */
  tme_recode_host_thunk_finish(ic);

  /* no further extension is needed for this read/write thunk: */
  rw_thunk->tme_recode_x86_rw_thunk_extend_size = 0;

  return (rw_thunk);
}

/* this host function tries to duplicate a read/write thunk: */
struct tme_recode_rw_thunk *
tme_recode_host_rw_thunk_dup(struct tme_recode_ic *ic,
			     const struct tme_recode_rw *rw,
			     const struct tme_recode_rw *rw_other)
{
  tme_uint8_t *thunk_bytes_0;
  struct tme_recode_insn insn_buffer;
  tme_uint8_t *thunk_bytes_1;
  struct tme_recode_rw_thunk *rw_thunk;

  /* start more instructions, so if we use _tme_recode_x86_insn_ext()
     to emit an extension instruction, we can discard it from the
     thunk build memory: */
  tme_recode_x86_insns_start(ic, thunk_bytes_0);

  /* if our register size is the same as the memory size, or
     if the existing read/write thunk will do the extension that
     we need to at least our register size: */
  /* NB: a read/write thunk always extends to at least host size, and
     when no particular extension is explicitly required,
     zero-extension is the default: */
  if ((rw->tme_recode_rw_reg_size
       == rw_other->tme_recode_rw_memory_size)
      || ((((rw_other->tme_recode_rw_reg_size
	     > rw_other->tme_recode_rw_memory_size)
	    && rw_other->tme_recode_rw_memory_signed)
	   == !!rw->tme_recode_rw_memory_signed)
	  && (TME_MAX(rw_other->tme_recode_rw_reg_size, TME_RECODE_SIZE_HOST)
	      >= rw->tme_recode_rw_reg_size))) {

    /* we can reuse the existing read/write thunk, and we don't need
       to do any extension: */
    /* nothing to do */
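    /* for example (added, not from the original source): a thunk
       built for an unsigned 16-bit read that is zero-extended into a
       host-size register can be reused as-is for another unsigned
       16-bit read whose register size is no larger than host size: */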
  }

  /* otherwise, if this is a double-host-size read: */
  else if (TME_RECODE_SIZE_IS_DOUBLE_HOST(rw->tme_recode_rw_reg_size)) {

    /* we won't reuse the existing read/write thunk: */
    return (NULL);
  }

  /* otherwise, we will reuse this read/write thunk, with a single
     zero- or sign-extension instruction after each call: */
  else {

    /* use _tme_recode_x86_insn_ext() to emit the extension
       instruction: */
    insn_buffer.tme_recode_insn_opcode
      = (rw->tme_recode_rw_memory_signed
	 ? TME_RECODE_OPCODE_EXTS
	 : TME_RECODE_OPCODE_EXTZ);
    insn_buffer.tme_recode_insn_operand_src[0] = TME_RECODE_X86_REG_HOST_FREE_CALL;
    insn_buffer.tme_recode_insn_operand_src[1] = rw->tme_recode_rw_memory_size;
    insn_buffer.tme_recode_insn_operand_dst = insn_buffer.tme_recode_insn_operand_src[0];
    _tme_recode_x86_insn_ext(ic, &insn_buffer);
  }

  /* duplicate the read/write thunk: */
  rw_thunk = tme_dup(struct tme_recode_rw_thunk, rw_other->tme_recode_rw_thunk, 1);

  /* get any extension instruction from the thunk build memory, and
     then discard it: */
  tme_recode_x86_insns_start(ic, thunk_bytes_1);
  rw_thunk->tme_recode_x86_rw_thunk_extend = *((tme_uint32_t *) thunk_bytes_0);
  rw_thunk->tme_recode_x86_rw_thunk_extend_size = (thunk_bytes_1 - thunk_bytes_0);
  assert (rw_thunk->tme_recode_x86_rw_thunk_extend_size <= sizeof(tme_uint32_t));
  tme_recode_x86_insns_finish(ic, thunk_bytes_0);

  /* return the duplicated read/write thunk: */
  return (rw_thunk);
}