%%% -*- erlang-indent-level: 2 -*-
%%%
%%% Licensed under the Apache License, Version 2.0 (the "License");
%%% you may not use this file except in compliance with the License.
%%% You may obtain a copy of the License at
%%%
%%%     http://www.apache.org/licenses/LICENSE-2.0
%%%
%%% Unless required by applicable law or agreed to in writing, software
%%% distributed under the License is distributed on an "AS IS" BASIS,
%%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%%% See the License for the specific language governing permissions and
%%% limitations under the License.
%%%
%%% HiPE/x86 assembler
%%%
%%% TODO:
%%% - Simplify combine_label_maps and mk_data_relocs.

-ifdef(HIPE_AMD64).
-define(HIPE_X86_ASSEMBLE,  hipe_amd64_assemble).
-define(HIPE_X86_ENCODE,    hipe_amd64_encode).
-define(HIPE_X86_REGISTERS, hipe_amd64_registers).
-define(HIPE_X86_PP,        hipe_amd64_pp).
-ifdef(AMD64_SIMULATE_NSP).
-define(X86_SIMULATE_NSP, ?AMD64_SIMULATE_NSP).
-endif.
-define(EAX, rax).
-define(REGArch, reg64).
-define(RMArch, rm64).
-define(EA_DISP32_ABSOLUTE, ea_disp32_sindex).
-else.
-define(HIPE_X86_ASSEMBLE,  hipe_x86_assemble).
-define(HIPE_X86_ENCODE,    hipe_x86_encode).
-define(HIPE_X86_REGISTERS, hipe_x86_registers).
-define(HIPE_X86_PP,        hipe_x86_pp).
-define(EAX, eax).
-define(REGArch, reg32).
-define(RMArch, rm32).
-define(EA_DISP32_ABSOLUTE, ea_disp32).
-endif.

-module(?HIPE_X86_ASSEMBLE).
-export([assemble/4]).

-define(DEBUG,true).

-include("../main/hipe.hrl").
-include("../x86/hipe_x86.hrl").
-include("../../kernel/src/hipe_ext_format.hrl").
-include("../rtl/hipe_literals.hrl").
-include("../misc/hipe_sdi.hrl").
-undef(ASSERT).
-define(ASSERT(G), if G -> [] ; true -> exit({assertion_failed,?MODULE,?LINE,??G}) end).

assemble(CompiledCode, Closures, Exports, Options) ->
  ?when_option(time, Options, ?start_timer("x86 assembler")),
  print("****************** Assembling *******************\n", [], Options),
  %%
  Code = [{MFA,
	   hipe_x86:defun_code(Defun),
	   hipe_x86:defun_data(Defun)}
	  || {MFA, Defun} <- CompiledCode],
  %%
  {ConstAlign,ConstSize,ConstMap,RefsFromConsts} =
    hipe_pack_constants:pack_constants(Code),
  %%
  {CodeSize,CodeBinary,AccRefs,LabelMap,ExportMap} =
    encode(translate(Code, ConstMap, Options), Options),
  print("Total num bytes=~w\n", [CodeSize], Options),
  %% put(code_size, CodeSize),
  %% put(const_size, ConstSize),
  %% ?when_option(verbose, Options,
  %%	       ?debug_msg("Constants are ~w bytes\n",[ConstSize])),
  %%
  SC = hipe_pack_constants:slim_constmap(ConstMap),
  DataRelocs = hipe_pack_constants:mk_data_relocs(RefsFromConsts, LabelMap),
  SSE = hipe_pack_constants:slim_sorted_exportmap(ExportMap,Closures,Exports),
  SlimRefs = hipe_pack_constants:slim_refs(AccRefs),
  Bin = term_to_binary([{?VERSION_STRING(),?HIPE_ERTS_CHECKSUM},
			ConstAlign, ConstSize,
			SC,
			DataRelocs, % nee LM, LabelMap
			SSE,
			CodeSize,CodeBinary,SlimRefs,
			0,[] % ColdCodeSize, SlimColdRefs
		       ]),
  %%
  %% ?when_option(time, Options, ?stop_timer("x86 assembler")),
  Bin.

%%%
%%% Assembly Pass 1.
%%% Process initial {MFA,Code,Data} list.
%%% Translate each MFA's body, choosing operand & instruction kinds.
%%%
%%% Assembly Pass 2.
%%% Perform short/long form optimisation for jumps.
%%% Build LabelMap for each MFA.
%%%
%%% Result is {MFA,NewCode,CodeSize,LabelMap} list.
%%%
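%%% For illustration (hypothetical shapes, not produced verbatim): a body
%%% such as [#label{label=1}, #jmp_label{label=1}] translates into
%%% [{'.label',1,OrigI1}, {jmp_sdi,{{label,1}},OrigI2}], and pass 2 then
%%% yields {MFA,NewCode,CodeSize,LabelMap} where LabelMap maps label 1 to
%%% its byte offset (here 0) within the function.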

translate(Code, ConstMap, Options) ->
  translate_mfas(Code, ConstMap, [], Options).

translate_mfas([{MFA,Insns,_Data}|Code], ConstMap, NewCode, Options) ->
  {NewInsns,CodeSize,LabelMap} =
    translate_insns(Insns, {MFA,ConstMap}, hipe_sdi:pass1_init(), 0, [], Options),
  translate_mfas(Code, ConstMap, [{MFA,NewInsns,CodeSize,LabelMap}|NewCode], Options);
translate_mfas([], _ConstMap, NewCode, _Options) ->
  lists:reverse(NewCode).

translate_insns([I|Insns], Context, SdiPass1, Address, NewInsns, Options) ->
  NewIs = translate_insn(I, Context, Options),
  add_insns(NewIs, Insns, Context, SdiPass1, Address, NewInsns, Options);
translate_insns([], _Context, SdiPass1, Address, NewInsns, _Options) ->
  {LabelMap,CodeSizeIncr} = hipe_sdi:pass2(SdiPass1),
  {lists:reverse(NewInsns), Address+CodeSizeIncr, LabelMap}.

add_insns([I|Is], Insns, Context, SdiPass1, Address, NewInsns, Options) ->
  NewSdiPass1 =
    case I of
      {'.label',L,_} ->
	hipe_sdi:pass1_add_label(SdiPass1, Address, L);
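      %% A short jcc is 2 bytes and a long one 6 (hence incr = 6-2);
      %% a short jmp is 2 bytes and a long one 5 (hence incr = 5-2).
      %% The rel8 range [-128,127] is measured from the end of the
      %% 2-byte short insn, i.e. [-128+2,127+2] from its start.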
      {jcc_sdi,{_,{label,L}},_} ->
	SdiInfo = #sdi_info{incr=(6-2),lb=(-128)+2,ub=127+2},
	hipe_sdi:pass1_add_sdi(SdiPass1, Address, L, SdiInfo);
      {jmp_sdi,{{label,L}},_} ->
	SdiInfo = #sdi_info{incr=(5-2),lb=(-128)+2,ub=127+2},
	hipe_sdi:pass1_add_sdi(SdiPass1, Address, L, SdiInfo);
      _ ->
	SdiPass1
    end,
  Address1 = Address + insn_size(I),
  add_insns(Is, Insns, Context, NewSdiPass1, Address1, [I|NewInsns], Options);
add_insns([], Insns, Context, SdiPass1, Address, NewInsns, Options) ->
  translate_insns(Insns, Context, SdiPass1, Address, NewInsns, Options).

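%% Note that jcc_sdi/jmp_sdi are counted at their 2-byte short form;
%% hipe_sdi's pass 2 adds the increment for each sdi that ends up
%% needing the long form.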
insn_size(I) ->
  case I of
    {'.label',_,_} -> 0;
    {'.sdesc',_,_} -> 0;
    {jcc_sdi,_,_} -> 2;
    {jmp_sdi,_,_} -> 2;
    {Op,Arg,_Orig} -> ?HIPE_X86_ENCODE:insn_sizeof(Op, Arg)
  end.

translate_insn(I, Context, Options) ->
  case I of
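    %% "xor reg,reg" zeroes reg, so the 32-bit form suffices; it has
    %% the shortest encoding (and on amd64 it zero-extends to 64 bits).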
    #alu{aluop='xor', src=#x86_temp{reg=Reg}=Src, dst=#x86_temp{reg=Reg}=Dst} ->
      [{'xor', {temp_to_reg32(Dst), temp_to_rm32(Src)}, I}];
    #alu{} ->
      Arg = resolve_alu_args(hipe_x86:alu_src(I), hipe_x86:alu_dst(I), Context),
      [{hipe_x86:alu_op(I), Arg, I}];
    #call{} ->
      translate_call(I);
    #cmovcc{} ->
      {Dst,Src} = resolve_move_args(
		    hipe_x86:cmovcc_src(I), hipe_x86:cmovcc_dst(I),
		    Context),
      CC = {cc,?HIPE_X86_ENCODE:cc(hipe_x86:cmovcc_cc(I))},
      Arg = {CC,Dst,Src},
      [{cmovcc, Arg, I}];
    #cmp{} ->
      Arg = resolve_alu_args(hipe_x86:cmp_src(I), hipe_x86:cmp_dst(I), Context),
      [{cmp, Arg, I}];
    #comment{} ->
      [];
    #fmove{} ->
      {Op,Arg} = resolve_sse2_fmove_args(hipe_x86:fmove_src(I),
					 hipe_x86:fmove_dst(I)),
      [{Op, Arg, I}];
    #fp_binop{} ->
      case proplists:get_bool(x87, Options) of
	true ->  % x87
	  Arg = resolve_x87_binop_args(hipe_x86:fp_binop_src(I),
				       hipe_x86:fp_binop_dst(I)),
	  [{hipe_x86:fp_binop_op(I), Arg, I}];
	false -> % sse2
	  Arg = resolve_sse2_binop_args(hipe_x86:fp_binop_src(I),
					hipe_x86:fp_binop_dst(I)),
	  [{resolve_sse2_op(hipe_x86:fp_binop_op(I)), Arg, I}]
      end;
    #fp_unop{} ->
      case proplists:get_bool(x87, Options) of
	true ->  % x87
	  Arg = resolve_x87_unop_arg(hipe_x86:fp_unop_arg(I)),
	  [{hipe_x86:fp_unop_op(I), Arg, I}];
	false -> % sse2
	  case hipe_x86:fp_unop_op(I) of
	    'fchs' ->
	      Arg = resolve_sse2_fchs_arg(hipe_x86:fp_unop_arg(I)),
	      [{'xorpd', Arg, I}];
	    'fwait' -> % no op on sse2, magic on x87
	      []
	  end
      end;
    #imul{} ->
      translate_imul(I, Context);
    #jcc{} ->
      Cc = {cc,?HIPE_X86_ENCODE:cc(hipe_x86:jcc_cc(I))},
      Label = translate_label(hipe_x86:jcc_label(I)),
      [{jcc_sdi, {Cc,Label}, I}];
    #jmp_fun{} ->
      %% call and jmp are patched the same, so no need to distinguish
      %% call from tailcall
      PatchTypeExt =
	case hipe_x86:jmp_fun_linkage(I) of
	  remote -> ?CALL_REMOTE;
	  not_remote -> ?CALL_LOCAL
	end,
      Arg = translate_fun(hipe_x86:jmp_fun_fun(I), PatchTypeExt),
      [{jmp, {Arg}, I}];
    #jmp_label{} ->
      Arg = translate_label(hipe_x86:jmp_label_label(I)),
      [{jmp_sdi, {Arg}, I}];
    #jmp_switch{} ->
      RM32 = resolve_jmp_switch_arg(I, Context),
      [{jmp, {RM32}, I}];
    #label{} ->
      [{'.label', hipe_x86:label_label(I), I}];
    #lea{} ->
      Arg = resolve_lea_args(hipe_x86:lea_mem(I), hipe_x86:lea_temp(I)),
      [{lea, Arg, I}];
    #move{} ->
      Arg = resolve_move_args(hipe_x86:move_src(I), hipe_x86:move_dst(I),
			      Context),
      [{mov, Arg, I}];
    #move64{} ->
      translate_move64(I, Context);
    #movsx{} ->
      Src = resolve_movx_src(hipe_x86:movsx_src(I)),
      [{movsx, {temp_to_regArch(hipe_x86:movsx_dst(I)), Src}, I}];
    #movzx{} ->
      Src = resolve_movx_src(hipe_x86:movzx_src(I)),
      [{movzx, {temp_to_reg32(hipe_x86:movzx_dst(I)), Src}, I}];
    %% pseudo_call: eliminated before assembly
    %% pseudo_jcc: eliminated before assembly
    %% pseudo_tailcall: eliminated before assembly
    %% pseudo_tailcall_prepare: eliminated before assembly
    #pop{} ->
      Arg = translate_dst(hipe_x86:pop_dst(I)),
      [{pop, {Arg}, I}];
    #push{} ->
      Arg = translate_src(hipe_x86:push_src(I), Context),
      [{push, {Arg}, I}];
    #ret{} ->
      translate_ret(I);
    #shift{} ->
      Arg = resolve_shift_args(hipe_x86:shift_src(I), hipe_x86:shift_dst(I), Context),
      [{hipe_x86:shift_op(I), Arg, I}];
    #test{} ->
      Arg = resolve_test_args(hipe_x86:test_src(I), hipe_x86:test_dst(I), Context),
      [{test, Arg, I}]
  end.

-ifdef(X86_SIMULATE_NSP).
-ifdef(HIPE_AMD64).
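%% With a simulated native stack pointer a call cannot use the real
%% call instruction, so it is open-coded. A sketch of the emitted
%% sequence (NSP is the simulated stack pointer, RA a scratch reg):
%%   sub  NSP, WordSize              ; I0: allocate the RA slot
%%   lea  RA, [rip + Size2 + Size3]  ; I1: address just past the jmp
%%   mov  [NSP], RA                  ; I2: store the return address
%%   jmp  Fun                        ; I3 (I4 is the stack descriptor)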
translate_call(I) ->
  WordSize = hipe_amd64_registers:wordsize(),
  RegSP = 2#100, % esp/rsp
  TempSP = hipe_x86:mk_temp(RegSP, untagged),
  FunOrig = hipe_x86:call_fun(I),
  Fun =
    case FunOrig of
      #x86_mem{base=#x86_temp{reg=4}, off=#x86_imm{value=Off}} ->
	FunOrig#x86_mem{off=#x86_imm{value=Off+WordSize}};
      _ -> FunOrig
    end,
  RegRA =
    begin
      RegTemp0 = hipe_amd64_registers:temp0(),
      RegTemp1 = hipe_amd64_registers:temp1(),
      case Fun of
	#x86_temp{reg=RegTemp0} -> RegTemp1;
	#x86_mem{base=#x86_temp{reg=RegTemp0}} -> RegTemp1;
	_ -> RegTemp0
      end
    end,
  TempRA = hipe_x86:mk_temp(RegRA, untagged),
  PatchTypeExt =
    case hipe_x86:call_linkage(I) of
      remote -> ?CALL_REMOTE;
      not_remote -> ?CALL_LOCAL
    end,
  JmpArg = translate_fun(Fun, PatchTypeExt),
  I4 = {'.sdesc', hipe_x86:call_sdesc(I), #comment{term=sdesc}},
  I3 = {jmp, {JmpArg}, #comment{term=call}},
  Size3 = hipe_amd64_encode:insn_sizeof(jmp, {JmpArg}),
  MovArgs = {mem_to_rmArch(hipe_x86:mk_mem(TempSP,
					     hipe_x86:mk_imm(0),
					     untagged)),
	     temp_to_regArch(TempRA)},
  I2 = {mov, MovArgs, #comment{term=call}},
  Size2 = hipe_amd64_encode:insn_sizeof(mov, MovArgs),
  I1 = {lea, {temp_to_regArch(TempRA),
	      {ea, hipe_amd64_encode:ea_disp32_rip(Size2+Size3)}},
	#comment{term=call}},
  I0 = {sub, {temp_to_rmArch(TempSP), {imm8,WordSize}}, I},
  [I0,I1,I2,I3,I4].
-else.
translate_call(I) ->
  WordSize = ?HIPE_X86_REGISTERS:wordsize(),
  RegSP = 2#100, % esp/rsp
  TempSP = hipe_x86:mk_temp(RegSP, untagged),
  FunOrig = hipe_x86:call_fun(I),
  Fun =
    case FunOrig of
      #x86_mem{base=#x86_temp{reg=4}, off=#x86_imm{value=Off}} ->
	FunOrig#x86_mem{off=#x86_imm{value=Off+WordSize}};
      _ -> FunOrig
    end,
  PatchTypeExt =
    case hipe_x86:call_linkage(I) of
      remote -> ?CALL_REMOTE;
      not_remote -> ?CALL_LOCAL
    end,
  JmpArg = translate_fun(Fun, PatchTypeExt),
  I3 = {'.sdesc', hipe_x86:call_sdesc(I), #comment{term=sdesc}},
  I2 = {jmp, {JmpArg}, #comment{term=call}},
  Size2 = ?HIPE_X86_ENCODE:insn_sizeof(jmp, {JmpArg}),
  I1 = {mov, {mem_to_rmArch(hipe_x86:mk_mem(TempSP,
					  hipe_x86:mk_imm(0),
					  untagged)),
	      {imm32,{?X86ABSPCREL,4+Size2}}},
	#comment{term=call}},
  I0 = {sub, {temp_to_rmArch(TempSP), {imm8,WordSize}}, I},
  [I0,I1,I2,I3].
-endif.

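%% Simulated-NSP return: load the return address into a scratch
%% register, deallocate the RA slot plus any callee-popped arguments
%% (NPOP bytes in total), then jmp via the scratch register.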
translate_ret(I) ->
  NPOP = hipe_x86:ret_npop(I) + ?HIPE_X86_REGISTERS:wordsize(),
  RegSP = 2#100, % esp/rsp
  TempSP = hipe_x86:mk_temp(RegSP, untagged),
  RegRA = 2#011, % ebx/rbx
  TempRA = hipe_x86:mk_temp(RegRA, untagged),
  [{mov,
    {temp_to_regArch(TempRA),
     mem_to_rmArch(hipe_x86:mk_mem(TempSP,
				   hipe_x86:mk_imm(0),
				   untagged))},
    I},
   {add,
    {temp_to_rmArch(TempSP),
     case NPOP < 128 of
       true -> {imm8,NPOP};
       false -> {imm32,NPOP}
     end},
    #comment{term=ret}},
   {jmp,
    {temp_to_rmArch(TempRA)},
    #comment{term=ret}}].

-else. % not X86_SIMULATE_NSP

translate_call(I) ->
  %% call and jmp are patched the same, so no need to distinguish
  %% call from tailcall
  PatchTypeExt =
    case hipe_x86:call_linkage(I) of
      remote -> ?CALL_REMOTE;
      not_remote -> ?CALL_LOCAL
    end,
  Arg = translate_fun(hipe_x86:call_fun(I), PatchTypeExt),
  SDesc = hipe_x86:call_sdesc(I),
  [{call, {Arg}, I}, {'.sdesc', SDesc, #comment{term=sdesc}}].

translate_ret(I) ->
  Arg =
    case hipe_x86:ret_npop(I) of
      0 -> {};
      N -> {{imm16,N}}
    end,
  [{ret, Arg, I}].

-endif. % X86_SIMULATE_NSP

translate_imul(I, Context) ->
  Temp = temp_to_regArch(hipe_x86:imul_temp(I)),
  Src = temp_or_mem_to_rmArch(hipe_x86:imul_src(I)),
  Args =
    case hipe_x86:imul_imm_opt(I) of
      [] -> {Temp,Src};
      Imm -> {Temp,Src,translate_imm(Imm, Context, true)}
    end,
  [{'imul', Args, I}].

temp_or_mem_to_rmArch(Src) ->
  case Src of
    #x86_temp{} -> temp_to_rmArch(Src);
    #x86_mem{} -> mem_to_rmArch(Src)
  end.

translate_label(Label) when is_integer(Label) ->
  {label,Label}.	% symbolic, since offset is not yet computable

translate_fun(Arg, PatchTypeExt) ->
  case Arg of
    #x86_temp{} ->
      temp_to_rmArch(Arg);
    #x86_mem{} ->
      mem_to_rmArch(Arg);
    #x86_mfa{m=M,f=F,a=A} ->
      {rel32,{PatchTypeExt,{M,F,A}}};
    #x86_prim{prim=Prim} ->
      {rel32,{PatchTypeExt,Prim}}
  end.

translate_src(Src, Context) ->
  case Src of
    #x86_imm{} ->
      translate_imm(Src, Context, true);
    _ ->
      translate_dst(Src)
  end.

%%% MayTrunc8 controls whether negative Imm8s should be truncated
%%% to 8 bits or not. Truncation should always be done, except when
%%% the caller will widen the Imm8 to an Imm32 or Imm64.
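%%% For example, translate_imm(#x86_imm{value=-1}, Ctx, true) yields
%%% {imm8,16#FF}, while translate_imm(#x86_imm{value=-1}, Ctx, false)
%%% yields {imm8,-1}, which the caller may widen to a sign-correct
%%% imm32 or imm64.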
translate_imm(#x86_imm{value=Imm}, Context, MayTrunc8) ->
  if is_atom(Imm) ->
      {imm32,{?LOAD_ATOM,Imm}};
     is_integer(Imm) ->
      case (Imm =< 127) and (Imm >= -128) of
	true ->
	  Imm8 =
	    case MayTrunc8 of
	      true -> Imm band 16#FF;
	      false -> Imm
	    end,
	  {imm8,Imm8};
	false ->
	  {imm32,Imm}
      end;
     true ->
      Val =
	case Imm of
	  {Label,constant} ->
	    {MFA,ConstMap} = Context,
	    ConstNo = hipe_pack_constants:find_const({MFA,Label}, ConstMap),
	    {constant,ConstNo};
	  {Label,closure} ->
	    {closure,Label};
	  {Label,c_const} ->
	    {c_const,Label}
	end,
      {imm32,{?LOAD_ADDRESS,Val}}
  end.

translate_dst(Dst) ->
  case Dst of
    #x86_temp{} ->
      temp_to_regArch(Dst);
    #x86_mem{type='double'} ->
      mem_to_rm64fp(Dst);
    #x86_mem{} ->
      mem_to_rmArch(Dst);
    #x86_fpreg{} ->
      fpreg_to_stack(Dst)
  end.

%%%
%%% Assembly Pass 3.
%%% Process final {MFA,Code,CodeSize,LabelMap} list from pass 2.
%%% Translate to a single binary code segment.
%%% Collect relocation patches.
%%% Build ExportMap (MFA-to-address mapping).
%%% Combine LabelMaps to a single one (for mk_data_relocs/2 compatibility).
%%% Return {CombinedCodeSize,BinaryCode,Relocs,CombinedLabelMap,ExportMap}.
%%%
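%%% For illustration: each ExportMap entry has the form {Address,M,F,A},
%%% and the combined label map is keyed on {MFA,Label}, so a label's
%%% final address is gb_trees:get({MFA,Label}, CombinedLabelMap).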

encode(Code, Options) ->
  CodeSize = compute_code_size(Code, 0),
  ExportMap = build_export_map(Code, 0, []),
  {AccCode,Relocs} = encode_mfas(Code, 0, [], [], Options),
  CodeBinary = list_to_binary(lists:reverse(AccCode)),
  ?ASSERT(CodeSize =:= byte_size(CodeBinary)),
  CombinedLabelMap = combine_label_maps(Code, 0, gb_trees:empty()),
  {CodeSize,CodeBinary,Relocs,CombinedLabelMap,ExportMap}.

nr_pad_bytes(Address) -> (4 - (Address rem 4)) rem 4. % XXX: 16 or 32 instead?

align_entry(Address) -> Address + nr_pad_bytes(Address).
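%% E.g. nr_pad_bytes(13) =:= 3, so align_entry(13) =:= 16,
%% while align_entry(16) =:= 16.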

compute_code_size([{_MFA,_Insns,CodeSize,_LabelMap}|Code], Size) ->
  compute_code_size(Code, align_entry(Size+CodeSize));
compute_code_size([], Size) -> Size.

build_export_map([{{M,F,A},_Insns,CodeSize,_LabelMap}|Code], Address, ExportMap) ->
  build_export_map(Code, align_entry(Address+CodeSize), [{Address,M,F,A}|ExportMap]);
build_export_map([], _Address, ExportMap) -> ExportMap.

combine_label_maps([{MFA,_Insns,CodeSize,LabelMap}|Code], Address, CLM) ->
  NewCLM = merge_label_map(gb_trees:to_list(LabelMap), MFA, Address, CLM),
  combine_label_maps(Code, align_entry(Address+CodeSize), NewCLM);
combine_label_maps([], _Address, CLM) -> CLM.

merge_label_map([{Label,Offset}|Rest], MFA, Address, CLM) ->
  NewCLM = gb_trees:insert({MFA,Label}, Address+Offset, CLM),
  merge_label_map(Rest, MFA, Address, NewCLM);
merge_label_map([], _MFA, _Address, CLM) -> CLM.

encode_mfas([{MFA,Insns,CodeSize,LabelMap}|Code], Address, AccCode, Relocs, Options) ->
  print("Generating code for:~w\n", [MFA], Options),
  print("Offset   | Opcode                   | Instruction\n", [], Options),
  {Address1,Relocs1,AccCode1} =
    encode_insns(Insns, Address, Address, LabelMap, Relocs, AccCode, Options),
  ExpectedAddress = align_entry(Address + CodeSize),
  ?ASSERT(Address1 =:= ExpectedAddress),
  print("Finished.\n\n", [], Options),
  encode_mfas(Code, Address1, AccCode1, Relocs1, Options);
encode_mfas([], _Address, AccCode, Relocs, _Options) ->
  {AccCode, Relocs}.

encode_insns([I|Insns], Address, FunAddress, LabelMap, Relocs, AccCode, Options) ->
  case I of
    {'.label',L,_} ->
      LabelAddress = gb_trees:get(L, LabelMap) + FunAddress,
      ?ASSERT(Address =:= LabelAddress),	% sanity check
      print_insn(Address, [], I, Options),
      encode_insns(Insns, Address, FunAddress, LabelMap, Relocs, AccCode, Options);
    {'.sdesc',SDesc,_} ->
      #x86_sdesc{exnlab=ExnLab,fsize=FSize,arity=Arity,live=Live} = SDesc,
      ExnRA =
	case ExnLab of
	  [] -> [];	% don't cons up a new one
	  ExnLab -> gb_trees:get(ExnLab, LabelMap) + FunAddress
	end,
      Reloc = {?SDESC, Address,
	       ?STACK_DESC(ExnRA, FSize, Arity, Live)},
      encode_insns(Insns, Address, FunAddress, LabelMap, [Reloc|Relocs], AccCode, Options);
    _ ->
      {Op,Arg,_} = fix_jumps(I, Address, FunAddress, LabelMap),
      {Bytes, NewRelocs} = ?HIPE_X86_ENCODE:insn_encode(Op, Arg, Address),
      print_insn(Address, Bytes, I, Options),
      Segment = list_to_binary(Bytes),
      Size = byte_size(Segment),
      NewAccCode = [Segment|AccCode],
      encode_insns(Insns, Address+Size, FunAddress, LabelMap, NewRelocs++Relocs, NewAccCode, Options)
  end;
encode_insns([], Address, FunAddress, LabelMap, Relocs, AccCode, Options) ->
  case nr_pad_bytes(Address) of
    0 ->
      {Address,Relocs,AccCode};
    NrPadBytes ->	% triggers at most once per function body
      Padding = lists:duplicate(NrPadBytes, {nop,{},#comment{term=padding}}),
      encode_insns(Padding, Address, FunAddress, LabelMap, Relocs, AccCode, Options)
  end.

fix_jumps(I, InsnAddress, FunAddress, LabelMap) ->
  case I of
    {jcc_sdi,{CC,{label,L}},OrigI} ->
      LabelAddress = gb_trees:get(L, LabelMap) + FunAddress,
      ShortOffset = LabelAddress - (InsnAddress + 2),
      if is_integer(ShortOffset), ShortOffset >= -128, ShortOffset =< 127 ->
	  {jcc,{CC,{rel8,ShortOffset band 16#FF}},OrigI};
	 true ->
	  LongOffset = LabelAddress - (InsnAddress + 6),
	  {jcc,{CC,{rel32,LongOffset}},OrigI}
      end;
    {jmp_sdi,{{label,L}},OrigI} ->
      LabelAddress = gb_trees:get(L, LabelMap) + FunAddress,
      ShortOffset = LabelAddress - (InsnAddress + 2),
      if is_integer(ShortOffset), ShortOffset >= -128, ShortOffset =< 127 ->
	  {jmp,{{rel8,ShortOffset band 16#FF}},OrigI};
	 true ->
	  LongOffset = LabelAddress - (InsnAddress + 5),
	  {jmp,{{rel32,LongOffset}},OrigI}
      end;
    _ -> I
  end.
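%% Worked example (hypothetical addresses): a jmp_sdi at InsnAddress 0
%% whose label resolves to address 10 gets ShortOffset 10 - (0+2) = 8
%% and becomes {jmp,{{rel8,8}},_}; a label 1000 bytes ahead falls back
%% to the long form {jmp,{{rel32,995}},_} (995 = 1000 - (0+5)).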

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

fpreg_to_stack(#x86_fpreg{reg=Reg}) ->
  {fpst, Reg}.

temp_to_regArch(#x86_temp{reg=Reg}) ->
  {?REGArch, Reg}.

-ifdef(HIPE_AMD64).
temp_to_reg64(#x86_temp{reg=Reg}) ->
  {reg64, Reg}.
-endif.

temp_to_reg32(#x86_temp{reg=Reg}) ->
  {reg32, Reg}.
temp_to_reg16(#x86_temp{reg=Reg}) ->
  {reg16, Reg}.
temp_to_reg8(#x86_temp{reg=Reg}) ->
  {reg8, Reg}.

temp_to_xmm(#x86_temp{reg=Reg}) ->
  {xmm, Reg}.

-ifdef(HIPE_AMD64).
temp_to_rm8(#x86_temp{reg=Reg}) ->
  {rm8, ?HIPE_X86_ENCODE:rm_reg(Reg)}.
temp_to_rm64(#x86_temp{reg=Reg}) ->
  {rm64, hipe_amd64_encode:rm_reg(Reg)}.
-else.
temp_to_rm8(#x86_temp{reg=Reg}) ->
  true = ?HIPE_X86_ENCODE:reg_has_8bit(Reg),
  {rm8, ?HIPE_X86_ENCODE:rm_reg(Reg)}.
temp_to_rm16(#x86_temp{reg=Reg}) ->
  {rm16, ?HIPE_X86_ENCODE:rm_reg(Reg)}.
-endif.

temp_to_rm32(#x86_temp{reg=Reg}) ->
  {rm32, ?HIPE_X86_ENCODE:rm_reg(Reg)}.
temp_to_rmArch(#x86_temp{reg=Reg}) ->
  {?RMArch, ?HIPE_X86_ENCODE:rm_reg(Reg)}.
temp_to_rm64fp(#x86_temp{reg=Reg}) ->
  {rm64fp, ?HIPE_X86_ENCODE:rm_reg(Reg)}.

mem_to_ea(Mem) ->
  EA = mem_to_ea_common(Mem),
  {ea, EA}.

mem_to_rm32(Mem) ->
  EA = mem_to_ea_common(Mem),
  {rm32, ?HIPE_X86_ENCODE:rm_mem(EA)}.

mem_to_rmArch(Mem) ->
  EA = mem_to_ea_common(Mem),
  {?RMArch, ?HIPE_X86_ENCODE:rm_mem(EA)}.

mem_to_rm64fp(Mem) ->
  EA = mem_to_ea_common(Mem),
  {rm64fp, ?HIPE_X86_ENCODE:rm_mem(EA)}.

%%%%%%%%%%%%%%%%%
mem_to_rm8(Mem) ->
  EA = mem_to_ea_common(Mem),
  {rm8, ?HIPE_X86_ENCODE:rm_mem(EA)}.

mem_to_rm16(Mem) ->
  EA = mem_to_ea_common(Mem),
  {rm16, ?HIPE_X86_ENCODE:rm_mem(EA)}.
%%%%%%%%%%%%%%%%%

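%% The special cases below stem from x86 ModRM/SIB encoding quirks:
%% a base of 2#101 (ebp/rbp, r13) cannot be encoded without a
%% displacement, and a base of 2#100 (esp/rsp, r12) always requires
%% a SIB byte.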
mem_to_ea_common(#x86_mem{base=[], off=#x86_imm{value=Off}}) ->
  ?HIPE_X86_ENCODE:?EA_DISP32_ABSOLUTE(Off);
mem_to_ea_common(#x86_mem{base=#x86_temp{reg=Base}, off=#x86_temp{reg=Index}}) ->
  case Base band 2#111 of
    5 -> % ebp/rbp or r13
      case Index band 2#111 of
	5 -> % ebp/rbp or r13
	  SINDEX = ?HIPE_X86_ENCODE:sindex(0, Index),
	  SIB = ?HIPE_X86_ENCODE:sib(Base, SINDEX),
	  ?HIPE_X86_ENCODE:ea_disp8_sib(0, SIB);
	_ ->
	  SINDEX = ?HIPE_X86_ENCODE:sindex(0, Base),
	  SIB = ?HIPE_X86_ENCODE:sib(Index, SINDEX),
	  ?HIPE_X86_ENCODE:ea_sib(SIB)
      end;
    _ ->
      SINDEX = ?HIPE_X86_ENCODE:sindex(0, Index),
      SIB = ?HIPE_X86_ENCODE:sib(Base, SINDEX),
      ?HIPE_X86_ENCODE:ea_sib(SIB)
  end;
mem_to_ea_common(#x86_mem{base=#x86_temp{reg=Base}, off=#x86_imm{value=Off}}) ->
  if
    Off =:= 0 ->
      case Base of
	4 -> %esp, use SIB w/o disp8
	  SIB = ?HIPE_X86_ENCODE:sib(Base),
	  ?HIPE_X86_ENCODE:ea_sib(SIB);
	5 -> %ebp, use disp8 w/o SIB
	  ?HIPE_X86_ENCODE:ea_disp8_base(Off, Base);
	12 -> %r12, use SIB w/o disp8
	  SIB = ?HIPE_X86_ENCODE:sib(Base),
	  ?HIPE_X86_ENCODE:ea_sib(SIB);
	13 -> %r13, use disp8 w/o SIB
	  ?HIPE_X86_ENCODE:ea_disp8_base(Off, Base);
	_ -> %neither SIB nor disp8 needed
	  ?HIPE_X86_ENCODE:ea_base(Base)
      end;
    Off >= -128, Off =< 127 ->
      Disp8 = Off band 16#FF,
      case Base of
	4 -> %esp, must use SIB
	  SIB = ?HIPE_X86_ENCODE:sib(Base),
	  ?HIPE_X86_ENCODE:ea_disp8_sib(Disp8, SIB);
	12 -> %r12, must use SIB
	  SIB = ?HIPE_X86_ENCODE:sib(Base),
	  ?HIPE_X86_ENCODE:ea_disp8_sib(Disp8, SIB);
	_ -> %use disp8 w/o SIB
	  ?HIPE_X86_ENCODE:ea_disp8_base(Disp8, Base)
      end;
    true ->
      case Base of
	4 -> %esp, must use SIB
	  SIB = ?HIPE_X86_ENCODE:sib(Base),
	  ?HIPE_X86_ENCODE:ea_disp32_sib(Off, SIB);
	12 -> %r12, must use SIB
	  SIB = ?HIPE_X86_ENCODE:sib(Base),
	  ?HIPE_X86_ENCODE:ea_disp32_sib(Off, SIB);
	_ ->
	  ?HIPE_X86_ENCODE:ea_disp32_base(Off, Base)
      end
  end.

%% jmp_switch
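%% On amd64 the jump table address is in a register, so the entry is
%% addressed base+index with scale 8 (sindex(3, ...)); on x86 the table
%% is a constant, so a disp32 with a scale-4 index (sindex(2, ...)) is
%% used instead.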
-ifdef(HIPE_AMD64).
resolve_jmp_switch_arg(I, _Context) ->
  Base = hipe_x86:temp_reg(hipe_x86:jmp_switch_jtab(I)),
  Index = hipe_x86:temp_reg(hipe_x86:jmp_switch_temp(I)),
  SINDEX = hipe_amd64_encode:sindex(3, Index),
  SIB = hipe_amd64_encode:sib(Base, SINDEX),
  EA =
    if (Base =:= 5) or (Base =:= 13) ->
	hipe_amd64_encode:ea_disp8_sib(0, SIB);
       true ->
	hipe_amd64_encode:ea_sib(SIB)
    end,
  {rm64,hipe_amd64_encode:rm_mem(EA)}.
-else.
resolve_jmp_switch_arg(I, {MFA,ConstMap}) ->
  ConstNo = hipe_pack_constants:find_const({MFA,hipe_x86:jmp_switch_jtab(I)}, ConstMap),
  Disp32 = {?LOAD_ADDRESS,{constant,ConstNo}},
  SINDEX = ?HIPE_X86_ENCODE:sindex(2, hipe_x86:temp_reg(hipe_x86:jmp_switch_temp(I))),
  EA = ?HIPE_X86_ENCODE:ea_disp32_sindex(Disp32, SINDEX), % this creates a SIB implicitly
  {rm32,?HIPE_X86_ENCODE:rm_mem(EA)}.
-endif.

%% lea reg, mem
resolve_lea_args(Src=#x86_mem{}, Dst=#x86_temp{}) ->
  {temp_to_regArch(Dst),mem_to_ea(Src)}.

resolve_sse2_op(Op) ->
  case Op of
    fadd -> addsd;
    fdiv -> divsd;
    fmul -> mulsd;
    fsub -> subsd;
    xorpd -> xorpd;
    _ -> exit({?MODULE, unknown_sse2_operator, Op})
  end.

%% OP xmm, mem
resolve_sse2_binop_args(Src=#x86_mem{type=double},
			Dst=#x86_temp{type=double}) ->
  {temp_to_xmm(Dst),mem_to_rm64fp(Src)};
%% movsd mem, xmm
resolve_sse2_binop_args(Src=#x86_temp{type=double},
			Dst=#x86_mem{type=double}) ->
  {mem_to_rm64fp(Dst),temp_to_xmm(Src)};
%% OP xmm, xmm
resolve_sse2_binop_args(Src=#x86_temp{type=double},
			Dst=#x86_temp{type=double}) ->
  {temp_to_xmm(Dst),temp_to_rm64fp(Src)}.

%%% fmove -> cvtsi2sd or movsd
resolve_sse2_fmove_args(Src, Dst) ->
  case {Src,Dst} of
    {#x86_temp{type=untagged}, #x86_temp{type=double}} -> % cvtsi2sd xmm, reg
      {cvtsi2sd, {temp_to_xmm(Dst),temp_to_rmArch(Src)}};
    {#x86_mem{type=untagged}, #x86_temp{type=double}} -> % cvtsi2sd xmm, mem
      {cvtsi2sd, {temp_to_xmm(Dst),mem_to_rmArch(Src)}};
    _ -> % movsd
      {movsd, resolve_sse2_binop_args(Src, Dst)}
  end.

%%% xorpd xmm, mem
resolve_sse2_fchs_arg(Dst=#x86_temp{type=double}) ->
  {temp_to_xmm(Dst),
   {rm64fp, {rm_mem, ?HIPE_X86_ENCODE:?EA_DISP32_ABSOLUTE(
		       {?LOAD_ADDRESS,
			{c_const, sse2_fnegate_mask}})}}}.

%% mov mem, imm
resolve_move_args(#x86_imm{value=ImmSrc}, Dst=#x86_mem{type=Type}, Context) ->
  case Type of   % to support byte, int16 and int32 stores
    byte ->
      ByteImm = ImmSrc band 255, %to ensure that it is a bytesized imm
      {mem_to_rm8(Dst),{imm8,ByteImm}};
    int16 ->
      {mem_to_rm16(Dst),{imm16,ImmSrc band 16#FFFF}};
    int32 ->
      {_,Imm} = translate_imm(#x86_imm{value=ImmSrc}, Context, false),
      {mem_to_rm32(Dst),{imm32,Imm}};
    _ ->
      RMArch = mem_to_rmArch(Dst),
      {_,Imm} = translate_imm(#x86_imm{value=ImmSrc}, Context, false),
      {RMArch,{imm32,Imm}}
  end;

%% mov reg,mem
resolve_move_args(Src=#x86_mem{type=Type}, Dst=#x86_temp{}, _Context) ->
  case Type of
    int32 -> % must be unsigned
      {temp_to_reg32(Dst),mem_to_rm32(Src)};
    _ ->
      {temp_to_regArch(Dst),mem_to_rmArch(Src)}
  end;

%% mov mem,reg
resolve_move_args(Src=#x86_temp{}, Dst=#x86_mem{type=Type}, _Context) ->
  case Type of   % to support byte, int16 and int32 stores
    byte ->
      {mem_to_rm8(Dst),temp_to_reg8(Src)};
    int16 ->
      {mem_to_rm16(Dst),temp_to_reg16(Src)};
    int32 ->
      {mem_to_rm32(Dst),temp_to_reg32(Src)};
    tagged -> % tagged, untagged
      {mem_to_rmArch(Dst),temp_to_regArch(Src)};
    untagged -> % tagged, untagged
      {mem_to_rmArch(Dst),temp_to_regArch(Src)}
  end;

%% mov reg,reg
resolve_move_args(Src=#x86_temp{}, Dst=#x86_temp{}, _Context) ->
  {temp_to_regArch(Dst),temp_to_rmArch(Src)};

%% mov reg,imm
resolve_move_args(Src=#x86_imm{value=_ImmSrc}, Dst=#x86_temp{}, Context) ->
  {_,Imm} = translate_imm(Src, Context, false),
  imm_move_args(Dst, Imm).

-ifdef(HIPE_AMD64).
imm_move_args(Dst, Imm) ->
  if is_number(Imm), Imm >= 0 ->
      {temp_to_reg32(Dst),{imm32,Imm}};
     true ->
      {temp_to_rm64(Dst),{imm32,Imm}}
  end.
-else.
imm_move_args(Dst, Imm) ->
  {temp_to_reg32(Dst),{imm32,Imm}}.
-endif.

-ifdef(HIPE_AMD64).
translate_move64(I, Context) ->
  Arg = resolve_move64_args(hipe_x86:move64_src(I),
			    hipe_x86:move64_dst(I),
			    Context),
  [{mov, Arg, I}].

%% mov reg,imm64
resolve_move64_args(Src=#x86_imm{}, Dst=#x86_temp{}, Context) ->
  {_,Imm} = translate_imm(Src, Context, false),
  {temp_to_reg64(Dst),{imm64,Imm}}.
-else.
translate_move64(I, _Context) -> exit({?MODULE, I}).
-endif.

%%% mov{s,z}x
resolve_movx_src(Src=#x86_mem{type=Type}) ->
  case Type of
    byte ->
      mem_to_rm8(Src);
    int16 ->
      mem_to_rm16(Src);
    int32 ->
      mem_to_rm32(Src)
  end.

%%% alu/cmp (_not_ test)
resolve_alu_args(Src, Dst, Context) ->
  case {Src,Dst} of
    {#x86_imm{}, #x86_mem{}} ->
      {mem_to_rmArch(Dst), translate_imm(Src, Context, true)};
    {#x86_mem{}, #x86_temp{}} ->
      {temp_to_regArch(Dst), mem_to_rmArch(Src)};
    {#x86_temp{}, #x86_mem{}} ->
      {mem_to_rmArch(Dst), temp_to_regArch(Src)};
    {#x86_temp{}, #x86_temp{}} ->
      {temp_to_regArch(Dst), temp_to_rmArch(Src)};
    {#x86_imm{}, #x86_temp{reg=0}} -> % eax,imm
      NewSrc = translate_imm(Src, Context, true),
      NewDst =
	case NewSrc of
	  {imm8,_} -> temp_to_rmArch(Dst);
	  {imm32,_} -> ?EAX
	end,
      {NewDst, NewSrc};
    {#x86_imm{}, #x86_temp{}} ->
      {temp_to_rmArch(Dst), translate_imm(Src, Context, true)}
  end.

%%% test
resolve_test_args(Src, Dst, Context) ->
  case Src of
    %% Since we're using an 8-bit instruction, the immediate is not sign
    %% extended. Thus, we can use immediates up to 255.
    #x86_imm{value=ImmVal}
      when is_integer(ImmVal), ImmVal >= 0, ImmVal =< 255 ->
      Imm = {imm8, ImmVal},
      case Dst of
	#x86_temp{reg=0} -> {al, Imm};
	#x86_temp{} -> resolve_test_imm8_reg(Imm, Dst);
	#x86_mem{} -> {mem_to_rm8(Dst), Imm}
      end;
    #x86_imm{value=ImmVal} when is_integer(ImmVal), ImmVal >= 0 ->
      {case Dst of
	 #x86_temp{reg=0} -> eax;
	 #x86_temp{} -> temp_to_rm32(Dst);
	 #x86_mem{} -> mem_to_rm32(Dst)
       end, {imm32, ImmVal}};
    #x86_imm{} -> % Negative ImmVal; use word-sized instr, imm32
      {_, ImmVal} = translate_imm(Src, Context, false),
      {case Dst of
	 #x86_temp{reg=0} -> ?EAX;
	 #x86_temp{} -> temp_to_rmArch(Dst);
	 #x86_mem{} -> mem_to_rmArch(Dst)
       end, {imm32, ImmVal}};
    #x86_temp{} ->
      NewDst =
	case Dst of
	  #x86_temp{} -> temp_to_rmArch(Dst);
	  #x86_mem{} -> mem_to_rmArch(Dst)
	end,
      {NewDst, temp_to_regArch(Src)}
  end.

-ifdef(HIPE_AMD64).
resolve_test_imm8_reg(Imm, Dst) -> {temp_to_rm8(Dst), Imm}.
-else.
resolve_test_imm8_reg(Imm = {imm8, ImmVal}, Dst = #x86_temp{reg=Reg}) ->
  case ?HIPE_X86_ENCODE:reg_has_8bit(Reg) of
    true -> {temp_to_rm8(Dst), Imm};
    false ->
      %% Register does not exist in 8-bit version; use 16-bit instead
      {temp_to_rm16(Dst), {imm16, ImmVal}}
  end.
-endif.

%%% shifts
resolve_shift_args(Src, Dst, Context) ->
  RM32 =
    case Dst of
      #x86_temp{} -> temp_to_rmArch(Dst);
      #x86_mem{} -> mem_to_rmArch(Dst)
    end,
  Count =
    case Src of
      #x86_imm{value=1} -> 1;
      #x86_imm{} -> translate_imm(Src, Context, true); % must be imm8
      #x86_temp{reg=1} -> cl	% temp must be ecx
    end,
  {RM32, Count}.

%% x87_unop mem
resolve_x87_unop_arg(Arg=#x86_mem{type=Type})->
  case Type of
    'double' -> {mem_to_rm64fp(Arg)};
    'untagged' -> {mem_to_rmArch(Arg)};
    _ -> ?EXIT({fmovArgNotSupported,{Arg}})
  end;
resolve_x87_unop_arg(Arg=#x86_fpreg{}) ->
  {fpreg_to_stack(Arg)};
resolve_x87_unop_arg([]) ->
  [].

%% x87_binop mem, st(i)
resolve_x87_binop_args(Src=#x86_fpreg{}, Dst=#x86_mem{})->
  {mem_to_rm64fp(Dst),fpreg_to_stack(Src)};
%% x87_binop st(0), st(i)
resolve_x87_binop_args(Src=#x86_fpreg{}, Dst=#x86_fpreg{})->
  {fpreg_to_stack(Dst),fpreg_to_stack(Src)}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%
%%% Assembly listing support (pp_asm option).
%%%

print(String, Arglist, Options) ->
  ?when_option(pp_asm, Options, io:format(String, Arglist)).

print_insn(Address, Bytes, I, Options) ->
  ?when_option(pp_asm, Options, print_insn_2(Address, Bytes, I)),
  ?when_option(pp_cxmon, Options, print_code_list_2(Bytes)).

print_code_list_2([H | Tail]) ->
  print_byte(H),
  io:format(","),
  print_code_list_2(Tail);
print_code_list_2([]) ->
  io:format("").

print_insn_2(Address, Bytes, {_,_,OrigI}) ->
  io:format("~8.16b | ", [Address]),
  print_code_list(Bytes, 0),
  ?HIPE_X86_PP:pp_insn(OrigI).

print_code_list([Byte|Rest], Len) ->
  print_byte(Byte),
  print_code_list(Rest, Len+1);
print_code_list([], Len) ->
  fill_spaces(24-(Len*2)),
  io:format(" | ").

print_byte(Byte) ->
  io:format("~2.16.0b", [Byte band 16#FF]).

fill_spaces(N) when N > 0 ->
  io:format(" "),
  fill_spaces(N-1);
fill_spaces(0) ->
  [].