1{-# LANGUAGE CPP, TypeFamilies #-}
2
3-----------------------------------------------------------------------------
4--
5-- Machine-dependent assembly language
6--
7-- (c) The University of Glasgow 1993-2004
8--
9-----------------------------------------------------------------------------
10
11module X86.Instr (Instr(..), Operand(..), PrefetchVariant(..), JumpDest(..),
12                  getJumpDestBlockId, canShortcut, shortcutStatics,
13                  shortcutJump, allocMoreStack,
14                  maxSpillSlots, archWordFormat )
15where
16
17#include "HsVersions.h"
18
19import GhcPrelude
20
21import X86.Cond
22import X86.Regs
23import Instruction
24import Format
25import RegClass
26import Reg
27import TargetReg
28
29import BlockId
30import Hoopl.Collections
31import Hoopl.Label
32import GHC.Platform.Regs
33import Cmm
34import FastString
35import Outputable
36import GHC.Platform
37
38import BasicTypes       (Alignment)
39import CLabel
40import DynFlags
41import UniqSet
42import Unique
43import UniqSupply
44import Debug (UnwindTable)
45
46import Control.Monad
47import Data.Maybe       (fromMaybe)
48
-- | Integer 'Format' of a native machine word on the target:
-- 'II32' when the argument says we are targeting 32-bit x86,
-- 'II64' otherwise (x86_64).
archWordFormat :: Bool -> Format
archWordFormat is32Bit = if is32Bit then II32 else II64
55
-- | 'Instruction' instance for the x86 instruction set.  Every class
-- method is simply delegated to the corresponding @x86_*@ function
-- defined in this module.
instance Instruction Instr where
    -- register usage and renaming
    regUsageOfInstr     = x86_regUsageOfInstr
    patchRegsOfInstr    = x86_patchRegsOfInstr

    -- control flow
    isJumpishInstr      = x86_isJumpishInstr
    jumpDestsOfInstr    = x86_jumpDestsOfInstr
    patchJumpInstr      = x86_patchJumpInstr
    mkJumpInstr         = x86_mkJumpInstr

    -- spilling and reloading
    mkSpillInstr        = x86_mkSpillInstr
    mkLoadInstr         = x86_mkLoadInstr

    -- pseudo-instructions
    takeDeltaInstr      = x86_takeDeltaInstr
    isMetaInstr         = x86_isMetaInstr

    -- register-to-register moves
    mkRegRegMoveInstr   = x86_mkRegRegMoveInstr
    takeRegRegMoveInstr = x86_takeRegRegMoveInstr

    -- C stack adjustment
    mkStackAllocInstr   = x86_mkStackAllocInstr
    mkStackDeallocInstr = x86_mkStackDeallocInstr
72
73
74-- -----------------------------------------------------------------------------
75-- Intel x86 instructions
76
77{-
78Intel, in their infinite wisdom, selected a stack model for floating
79point registers on x86.  That might have made sense back in 1979 --
80nowadays we can see it for the nonsense it really is.  A stack model
81fits poorly with the existing nativeGen infrastructure, which assumes
82flat integer and FP register sets.  Prior to this commit, nativeGen
83could not generate correct x86 FP code -- to do so would have meant
84somehow working the register-stack paradigm into the register
85allocator and spiller, which sounds very difficult.
86
87We have decided to cheat, and go for a simple fix which requires no
88infrastructure modifications, at the expense of generating ropey but
89correct FP code.  All notions of the x86 FP stack and its insns have
90been removed.  Instead, we pretend (to the instruction selector and
91register allocator) that x86 has six floating point registers, %fake0
92.. %fake5, which can be used in the usual flat manner.  We further
93claim that x86 has floating point instructions very similar to SPARC
94and Alpha, that is, a simple 3-operand register-register arrangement.
95Code generation and register allocation proceed on this basis.
96
97When we come to print out the final assembly, our convenient fiction
98is converted to dismal reality.  Each fake instruction is
99independently converted to a series of real x86 instructions.
100%fake0 .. %fake5 are mapped to %st(0) .. %st(5).  To do reg-reg
101arithmetic operations, the two operands are pushed onto the top of the
102FP stack, the operation done, and the result copied back into the
103relevant register.  There are only six %fake registers because 2 are
104needed for the translation, and x86 has 8 in total.
105
106The translation is inefficient but is simple and it works.  A cleverer
107translation would handle a sequence of insns, simulating the FP stack
108contents, would not impose a fixed mapping from %fake to %st regs, and
109hopefully could avoid most of the redundant reg-reg moves of the
110current translation.
111
112We might as well make use of whatever unique FP facilities Intel have
113chosen to bless us with (let's not be churlish, after all).
114Hence GLDZ and GLD1.  Bwahahahahahahaha!
115-}
116
117{-
118Note [x86 Floating point precision]
119
120Intel's internal floating point registers are by default 80 bit
121extended precision.  This means that all operations done on values in
122registers are done at 80 bits, and unless the intermediate values are
123truncated to the appropriate size (32 or 64 bits) by storing in
124memory, calculations in registers will give different results from
125calculations which pass intermediate values in memory (eg. via
126function calls).
127
128One solution is to set the FPU into 64 bit precision mode.  Some OSs
129do this (eg. FreeBSD) and some don't (eg. Linux).  The problem here is
130that this will only affect 64-bit precision arithmetic; 32-bit
131calculations will still be done at 64-bit precision in registers.  So
132it doesn't solve the whole problem.
133
134There's also the issue of what the C library is expecting in terms of
135precision.  It seems to be the case that glibc on Linux expects the
136FPU to be set to 80 bit precision, so setting it to 64 bit could have
137unexpected effects.  Changing the default could have undesirable
138effects on other 3rd-party library code too, so the right thing would
139be to save/restore the FPU control word across Haskell code if we were
140to do this.
141
142gcc's -ffloat-store gives consistent results by always storing the
143results of floating-point calculations in memory, which works for both
14432 and 64-bit precision.  However, it only affects the values of
145user-declared floating point variables in C, not intermediate results.
146GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
147flag).
148
149Another problem is how to spill floating point registers in the
150register allocator.  Should we spill the whole 80 bits, or just 64?
151On an OS which is set to 64 bit precision, spilling 64 is fine.  On
152Linux, spilling 64 bits will round the results of some operations.
153This is what gcc does.  Spilling at 80 bits requires taking up a full
154128 bit slot (so we get alignment).  We spill at 80-bits and ignore
155the alignment problems.
156
157In the future [edit: now available in GHC 7.0.1, with the -msse2
158flag], we'll use the SSE registers for floating point.  This requires
159a CPU that supports SSE2 (ordinary SSE only supports 32 bit precision
160float ops), which means P4 or Xeon and above.  Using SSE will solve
161all these problems, because the SSE registers use fixed 32 bit or 64
162bit precision.
163
164--SDM 1/2003
165-}
166
-- | The x86/x86_64 instruction type: a handful of pseudo-ops (comments,
-- source locations, embedded static data, block markers, unwind
-- information, stack deltas) followed by the machine instructions proper.
data Instr
        -- comment pseudo-op
        = COMMENT FastString

        -- location pseudo-op (file, line, col, name)
        | LOCATION Int Int Int String

        -- some static data spat out during code
        -- generation.  Will be extracted before
        -- pretty-printing.
        | LDATA   Section (Alignment, CmmStatics)

        -- start a new basic block.  Useful during
        -- codegen, removed later.  Preceding
        -- instruction should be a jump, as per the
        -- invariants for a BasicBlock (see Cmm).
        | NEWBLOCK BlockId

        -- unwinding information
        -- See Note [Unwinding information in the NCG].
        | UNWIND CLabel UnwindTable

        -- specify current stack offset for benefit of subsequent passes.
        -- NOTE(review): an earlier version of this comment said that this
        -- "carries a BlockId so it can be used in unwinding information",
        -- but the constructor as declared here only carries an Int.
        | DELTA  Int

        -- Moves.
        | MOV         Format Operand Operand
        | CMOV   Cond Format Operand Reg
        | MOVZxL      Format Operand Operand -- format is the size of operand 1
        | MOVSxL      Format Operand Operand -- format is the size of operand 1
        -- x86_64 note: plain mov into a 32-bit register always zero-extends
        -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
        -- don't affect the high bits of the register.

        -- Load effective address (also a very useful three-operand add instruction :-)
        | LEA         Format Operand Operand

        -- Int Arithmetic.
        | ADD         Format Operand Operand
        | ADC         Format Operand Operand
        | SUB         Format Operand Operand
        | SBB         Format Operand Operand

        | MUL         Format Operand Operand
        | MUL2        Format Operand         -- %edx:%eax = operand * %eax
        | IMUL        Format Operand Operand -- signed int mul
        | IMUL2       Format Operand         -- %edx:%eax = operand * %eax

        | DIV         Format Operand         -- eax := eax:edx/op, edx := eax:edx%op
        | IDIV        Format Operand         -- ditto, but signed

        -- Int Arithmetic, where the effects on the condition register
        -- are important. Used in specialized sequences such as MO_Add2.
        -- Do not rewrite these instructions to "equivalent" ones that
        -- have different effect on the condition register! (See #9013.)
        | ADD_CC      Format Operand Operand
        | SUB_CC      Format Operand Operand

        -- Simple bit-twiddling.
        | AND         Format Operand Operand
        | OR          Format Operand Operand
        | XOR         Format Operand Operand
        | NOT         Format Operand
        | NEGI        Format Operand         -- NEG instruction (name clash with Cond)
        | BSWAP       Format Reg

        -- Shifts (amount may be immediate or %cl only)
        | SHL         Format Operand{-amount-} Operand
        | SAR         Format Operand{-amount-} Operand
        | SHR         Format Operand{-amount-} Operand

        | BT          Format Imm Operand
        | NOP


        -- We need to support the FSTP (x87 store and pop) instruction
        -- so that we can correctly read off the return value of an
        -- x86 CDECL C function call when it is floating point.
        -- We don't include a register argument, and just use st(0).
        -- This instruction is used ONLY for return values of C FFI calls
        -- in the x86_32 ABI.
        | X87Store         Format  AddrMode -- st(0), dst


        -- SSE2 floating point: we use a restricted set of the available SSE2
        -- instructions for floating-point.
        -- use MOV for moving (either movss or movsd (movlpd better?))
        | CVTSS2SD      Reg Reg            -- F32 to F64
        | CVTSD2SS      Reg Reg            -- F64 to F32
        | CVTTSS2SIQ    Format Operand Reg -- F32 to I32/I64 (with truncation)
        | CVTTSD2SIQ    Format Operand Reg -- F64 to I32/I64 (with truncation)
        | CVTSI2SS      Format Operand Reg -- I32/I64 to F32
        | CVTSI2SD      Format Operand Reg -- I32/I64 to F64

        -- use ADD, SUB, and SQRT for arithmetic.  In both cases, operands
        -- are  Operand Reg.

        -- SSE2 floating-point division:
        | FDIV          Format Operand Operand   -- divisor, dividend(dst)

        -- use CMP for comparisons.  ucomiss and ucomisd instructions
        -- compare single/double prec floating point respectively.

        | SQRT          Format Operand Reg      -- src, dst


        -- Comparison
        | TEST          Format Operand Operand
        | CMP           Format Operand Operand
        | SETCC         Cond Operand

        -- Stack Operations.
        | PUSH          Format Operand
        | POP           Format Operand
        -- both unused (SDM):
        --  | PUSHA
        --  | POPA

        -- Jumping around.
        | JMP         Operand [Reg] -- including live Regs at the call
        | JXX         Cond BlockId  -- includes unconditional branches
        | JXX_GBL     Cond Imm      -- non-local version of JXX
        -- Table jump
        | JMP_TBL     Operand   -- Address to jump to
                      [Maybe JumpDest] -- Targets of the jump table
                      Section   -- Data section jump table should be put in
                      CLabel    -- Label of jump table
        -- | X86 call instruction
        | CALL        (Either Imm Reg) -- ^ Jump target
                      [Reg]            -- ^ Arguments (required for register allocation)

        -- Other things.
        | CLTD Format            -- sign extend %eax into %edx:%eax

        | FETCHGOT    Reg        -- pseudo-insn for ELF position-independent code
                                 -- pretty-prints as
                                 --       call 1f
                                 -- 1:    popl %reg
                                 --       addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
        | FETCHPC     Reg        -- pseudo-insn for Darwin position-independent code
                                 -- pretty-prints as
                                 --       call 1f
                                 -- 1:    popl %reg

    -- bit counting instructions
        | POPCNT      Format Operand Reg -- [SSE4.2] count number of bits set to 1
        | LZCNT       Format Operand Reg -- [BMI2] count number of leading zeros
        | TZCNT       Format Operand Reg -- [BMI2] count number of trailing zeros
        | BSF         Format Operand Reg -- bit scan forward
        | BSR         Format Operand Reg -- bit scan reverse

    -- bit manipulation instructions
        | PDEP        Format Operand Operand Reg -- [BMI2] deposit bits to   the specified mask
        | PEXT        Format Operand Operand Reg -- [BMI2] extract bits from the specified mask

    -- prefetch
        | PREFETCH  PrefetchVariant Format Operand -- prefetch Variant, addr size, address to prefetch
                                        -- variant can be NTA, Lvl0, Lvl1, or Lvl2

        -- Atomic operations and fences.
        | LOCK        Instr -- lock prefix (wraps the instruction it applies to)
        | XADD        Format Operand Operand -- src (r), dst (r/m)
        | CMPXCHG     Format Operand Operand -- src (r), dst (r/m), eax implicit
        | MFENCE
331
-- | Which flavour of the PREFETCH instruction to emit; this is the first
-- field of the 'PREFETCH' constructor of 'Instr'.
data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2
333
334
-- | An instruction operand: a register, an immediate value, or a memory
-- reference described by an 'AddrMode'.
data Operand
        = OpReg  Reg            -- register
        | OpImm  Imm            -- immediate value
        | OpAddr AddrMode       -- memory reference
339
340
341
-- | Returns which registers are read and written as a (read, written)
-- pair.
--
-- Both lists are filtered through 'interesting' (see 'mkRU' / 'mkRUR'
-- below), so only registers the allocator cares about are reported.
-- Constructors without an explicit case (e.g. 'LDATA', 'NEWBLOCK') hit the
-- final catch-all and panic.
x86_regUsageOfInstr :: Platform -> Instr -> RegUsage
x86_regUsageOfInstr platform instr
 = case instr of
    MOV    _ src dst    -> usageRW src dst
    -- CMOV may leave dst untouched, so dst counts as read as well as written.
    CMOV _ _ src dst    -> mkRU (use_R src [dst]) [dst]
    MOVZxL _ src dst    -> usageRW src dst
    MOVSxL _ src dst    -> usageRW src dst
    LEA    _ src dst    -> usageRW src dst
    ADD    _ src dst    -> usageRM src dst
    ADC    _ src dst    -> usageRM src dst
    SUB    _ src dst    -> usageRM src dst
    SBB    _ src dst    -> usageRM src dst
    IMUL   _ src dst    -> usageRM src dst

    -- Result of IMULB will be in just in %ax
    -- (this case must stay ahead of the general IMUL2 case below)
    IMUL2  II8 src       -> mkRU (eax:use_R src []) [eax]
    -- Result of IMUL for wider values, will be split between %dx/%edx/%rdx and
    -- %ax/%eax/%rax.
    IMUL2  _ src        -> mkRU (eax:use_R src []) [eax,edx]

    MUL    _ src dst    -> usageRM src dst
    MUL2   _ src        -> mkRU (eax:use_R src []) [eax,edx]
    -- Division reads and writes both eax and edx.
    DIV    _ op -> mkRU (eax:edx:use_R op []) [eax,edx]
    IDIV   _ op -> mkRU (eax:edx:use_R op []) [eax,edx]
    ADD_CC _ src dst    -> usageRM src dst
    SUB_CC _ src dst    -> usageRM src dst
    AND    _ src dst    -> usageRM src dst
    OR     _ src dst    -> usageRM src dst

    -- XOR of a register with itself is reported as a pure write of that
    -- register (no read).  Must stay ahead of the general XOR case.
    XOR    _ (OpReg src) (OpReg dst)
        | src == dst    -> mkRU [] [dst]

    XOR    _ src dst    -> usageRM src dst
    NOT    _ op         -> usageM op
    BSWAP  _ reg        -> mkRU [reg] [reg]
    NEGI   _ op         -> usageM op
    -- For shifts the first operand is the shift amount.
    SHL    _ imm dst    -> usageRM imm dst
    SAR    _ imm dst    -> usageRM imm dst
    SHR    _ imm dst    -> usageRM imm dst
    BT     _ _   src    -> mkRUR (use_R src [])

    PUSH   _ op         -> mkRUR (use_R op [])
    POP    _ op         -> mkRU [] (def_W op)
    TEST   _ src dst    -> mkRUR (use_R src $! use_R dst [])
    CMP    _ src dst    -> mkRUR (use_R src $! use_R dst [])
    SETCC  _ op         -> mkRU [] (def_W op)
    JXX    _ _          -> mkRU [] []
    JXX_GBL _ _         -> mkRU [] []
    -- The live-register list carried by JMP counts as read.
    JMP     op regs     -> mkRUR (use_R op regs)
    JMP_TBL op _ _ _    -> mkRUR (use_R op [])
    -- A call reads its argument registers (plus the target register for an
    -- indirect call) and writes every call-clobbered register.
    CALL (Left _)  params   -> mkRU params (callClobberedRegs platform)
    CALL (Right reg) params -> mkRU (reg:params) (callClobberedRegs platform)
    CLTD   _            -> mkRU [eax] [edx]
    NOP                 -> mkRU [] []

    -- Only the registers used to form the destination address are reported.
    X87Store    _  dst    -> mkRUR ( use_EA dst [])

    CVTSS2SD   src dst  -> mkRU [src] [dst]
    CVTSD2SS   src dst  -> mkRU [src] [dst]
    CVTTSS2SIQ _ src dst -> mkRU (use_R src []) [dst]
    CVTTSD2SIQ _ src dst -> mkRU (use_R src []) [dst]
    CVTSI2SS   _ src dst -> mkRU (use_R src []) [dst]
    CVTSI2SD   _ src dst -> mkRU (use_R src []) [dst]
    FDIV _     src dst  -> usageRM src dst
    SQRT _ src dst      -> mkRU (use_R src []) [dst]

    FETCHGOT reg        -> mkRU [] [reg]
    FETCHPC  reg        -> mkRU [] [reg]

    -- Pseudo-ops touch no registers.
    COMMENT _           -> noUsage
    LOCATION{}          -> noUsage
    UNWIND{}            -> noUsage
    DELTA   _           -> noUsage

    POPCNT _ src dst -> mkRU (use_R src []) [dst]
    LZCNT  _ src dst -> mkRU (use_R src []) [dst]
    TZCNT  _ src dst -> mkRU (use_R src []) [dst]
    BSF    _ src dst -> mkRU (use_R src []) [dst]
    BSR    _ src dst -> mkRU (use_R src []) [dst]

    PDEP   _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
    PEXT   _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]

    -- note: might be a better way to do this
    PREFETCH _  _ src -> mkRU (use_R src []) []
    -- A LOCK prefix has the register usage of the instruction it wraps.
    LOCK i              -> x86_regUsageOfInstr platform i
    XADD _ src dst      -> usageMM src dst
    -- eax is passed as the implicit third (modified) operand.
    CMPXCHG _ src dst   -> usageRMM src dst (OpReg eax)
    MFENCE -> noUsage

    _other              -> panic "regUsage: unrecognised instr"
 where
    -- # Definitions
    --
    -- Written: If the operand is a register, it's written. If it's an
    -- address, registers mentioned in the address are read.
    --
    -- Modified: If the operand is a register, it's both read and
    -- written. If it's an address, registers mentioned in the address
    -- are read.

    -- 2 operand form; first operand Read; second Written
    -- (panics if the second operand is an immediate)
    usageRW :: Operand -> Operand -> RegUsage
    usageRW op (OpReg reg)      = mkRU (use_R op []) [reg]
    usageRW op (OpAddr ea)      = mkRUR (use_R op $! use_EA ea [])
    usageRW _ _                 = panic "X86.RegInfo.usageRW: no match"

    -- 2 operand form; first operand Read; second Modified
    -- (panics if the second operand is an immediate)
    usageRM :: Operand -> Operand -> RegUsage
    usageRM op (OpReg reg)      = mkRU (use_R op [reg]) [reg]
    usageRM op (OpAddr ea)      = mkRUR (use_R op $! use_EA ea [])
    usageRM _ _                 = panic "X86.RegInfo.usageRM: no match"

    -- 2 operand form; first operand Modified; second Modified
    -- (the first operand must be a register, otherwise this panics)
    usageMM :: Operand -> Operand -> RegUsage
    usageMM (OpReg src) (OpReg dst) = mkRU [src, dst] [src, dst]
    usageMM (OpReg src) (OpAddr ea) = mkRU (use_EA ea [src]) [src]
    usageMM _ _                     = panic "X86.RegInfo.usageMM: no match"

    -- 3 operand form; first operand Read; second Modified; third Modified
    -- (the first and third operands must be registers, otherwise this panics)
    usageRMM :: Operand -> Operand -> Operand -> RegUsage
    usageRMM (OpReg src) (OpReg dst) (OpReg reg) = mkRU [src, dst, reg] [dst, reg]
    usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg]
    usageRMM _ _ _                               = panic "X86.RegInfo.usageRMM: no match"

    -- 1 operand form; operand Modified
    usageM :: Operand -> RegUsage
    usageM (OpReg reg)          = mkRU [reg] [reg]
    usageM (OpAddr ea)          = mkRUR (use_EA ea [])
    usageM _                    = panic "X86.RegInfo.usageM: no match"

    -- Registers defd when an operand is written.
    def_W (OpReg reg)           = [reg]
    def_W (OpAddr _ )           = []
    def_W _                     = panic "X86.RegInfo.def_W: no match"

    -- Registers used when an operand is read.
    -- The second argument is a tail to which the registers are prepended.
    use_R (OpReg reg)  tl = reg : tl
    use_R (OpImm _)    tl = tl
    use_R (OpAddr ea)  tl = use_EA ea tl

    -- Registers used to compute an effective address.
    use_EA (ImmAddr _ _) tl = tl
    use_EA (AddrBaseIndex base index _) tl =
        use_base base $! use_index index tl
        where use_base (EABaseReg r)  tl = r : tl
              use_base _              tl = tl
              use_index EAIndexNone   tl = tl
              use_index (EAIndex i _) tl = i : tl

    -- Build a read-only usage; the filtered list is forced to WHNF first.
    mkRUR src = src' `seq` RU src' []
        where src' = filter (interesting platform) src

    -- Build a read/write usage; both filtered lists are forced to WHNF first.
    mkRU src dst = src' `seq` dst' `seq` RU src' dst'
        where src' = filter (interesting platform) src
              dst' = filter (interesting platform) dst
500
-- | Should the register allocator track this register?  Virtual
-- registers always qualify; a single real register qualifies exactly
-- when 'freeReg' says so for the platform.  Register pairs do not
-- exist on x86, so encountering one is a panic.
interesting :: Platform -> Reg -> Bool
interesting platform reg
  = case reg of
      RegVirtual _              -> True
      RegReal (RealRegSingle n) -> freeReg platform n
      RegReal (RealRegPair{})   -> panic "X86.interesting: no reg pairs on this arch"
506
507
508
-- | Applies the supplied function to all registers in instructions.
-- Typically used to change virtual registers to real registers.
--
-- Operands are rebuilt strictly (see 'patch1', 'patch2' and 'patchOp'
-- below).  Constructors without an explicit case (e.g. 'LDATA',
-- 'NEWBLOCK') hit the final catch-all and panic.
x86_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
x86_patchRegsOfInstr instr env
 = case instr of
    MOV  fmt src dst     -> patch2 (MOV  fmt) src dst
    CMOV cc fmt src dst  -> CMOV cc fmt (patchOp src) (env dst)
    MOVZxL fmt src dst   -> patch2 (MOVZxL fmt) src dst
    MOVSxL fmt src dst   -> patch2 (MOVSxL fmt) src dst
    LEA  fmt src dst     -> patch2 (LEA  fmt) src dst
    ADD  fmt src dst     -> patch2 (ADD  fmt) src dst
    ADC  fmt src dst     -> patch2 (ADC  fmt) src dst
    SUB  fmt src dst     -> patch2 (SUB  fmt) src dst
    SBB  fmt src dst     -> patch2 (SBB  fmt) src dst
    IMUL fmt src dst     -> patch2 (IMUL fmt) src dst
    IMUL2 fmt src        -> patch1 (IMUL2 fmt) src
    MUL fmt src dst      -> patch2 (MUL fmt) src dst
    MUL2 fmt src         -> patch1 (MUL2 fmt) src
    IDIV fmt op          -> patch1 (IDIV fmt) op
    DIV fmt op           -> patch1 (DIV fmt) op
    ADD_CC fmt src dst   -> patch2 (ADD_CC fmt) src dst
    SUB_CC fmt src dst   -> patch2 (SUB_CC fmt) src dst
    AND  fmt src dst     -> patch2 (AND  fmt) src dst
    OR   fmt src dst     -> patch2 (OR   fmt) src dst
    XOR  fmt src dst     -> patch2 (XOR  fmt) src dst
    NOT  fmt op          -> patch1 (NOT  fmt) op
    BSWAP fmt reg        -> BSWAP fmt (env reg)
    NEGI fmt op          -> patch1 (NEGI fmt) op
    -- For shifts only the destination is patched; the shift-amount
    -- operand is passed through unchanged.
    SHL  fmt imm dst     -> patch1 (SHL fmt imm) dst
    SAR  fmt imm dst     -> patch1 (SAR fmt imm) dst
    SHR  fmt imm dst     -> patch1 (SHR fmt imm) dst
    BT   fmt imm src     -> patch1 (BT  fmt imm) src
    TEST fmt src dst     -> patch2 (TEST fmt) src dst
    CMP  fmt src dst     -> patch2 (CMP  fmt) src dst
    PUSH fmt op          -> patch1 (PUSH fmt) op
    POP  fmt op          -> patch1 (POP  fmt) op
    SETCC cond op        -> patch1 (SETCC cond) op
    -- The live-register list of a JMP is passed through unchanged.
    JMP op regs          -> JMP (patchOp op) regs
    JMP_TBL op ids s lbl -> JMP_TBL (patchOp op) ids s lbl

    -- literally only support storing the top x87 stack value st(0)
    X87Store  fmt  dst     -> X87Store fmt  (lookupAddr dst)

    CVTSS2SD src dst    -> CVTSS2SD (env src) (env dst)
    CVTSD2SS src dst    -> CVTSD2SS (env src) (env dst)
    CVTTSS2SIQ fmt src dst -> CVTTSS2SIQ fmt (patchOp src) (env dst)
    CVTTSD2SIQ fmt src dst -> CVTTSD2SIQ fmt (patchOp src) (env dst)
    CVTSI2SS fmt src dst -> CVTSI2SS fmt (patchOp src) (env dst)
    CVTSI2SD fmt src dst -> CVTSI2SD fmt (patchOp src) (env dst)
    FDIV fmt src dst     -> FDIV fmt (patchOp src) (patchOp dst)
    SQRT fmt src dst    -> SQRT fmt (patchOp src) (env dst)

    -- Only the target register of an indirect call is patched; the
    -- argument register list is passed through unchanged.
    CALL (Left _)  _    -> instr
    CALL (Right reg) p  -> CALL (Right (env reg)) p

    FETCHGOT reg        -> FETCHGOT (env reg)
    FETCHPC  reg        -> FETCHPC  (env reg)

    -- These carry no patchable register fields and are returned as-is.
    NOP                 -> instr
    COMMENT _           -> instr
    LOCATION {}         -> instr
    UNWIND {}           -> instr
    DELTA _             -> instr

    JXX _ _             -> instr
    JXX_GBL _ _         -> instr
    CLTD _              -> instr

    POPCNT fmt src dst -> POPCNT fmt (patchOp src) (env dst)
    LZCNT  fmt src dst -> LZCNT  fmt (patchOp src) (env dst)
    TZCNT  fmt src dst -> TZCNT  fmt (patchOp src) (env dst)
    PDEP   fmt src mask dst -> PDEP   fmt (patchOp src) (patchOp mask) (env dst)
    PEXT   fmt src mask dst -> PEXT   fmt (patchOp src) (patchOp mask) (env dst)
    BSF    fmt src dst -> BSF    fmt (patchOp src) (env dst)
    BSR    fmt src dst -> BSR    fmt (patchOp src) (env dst)

    PREFETCH lvl format src -> PREFETCH lvl format (patchOp src)

    -- Patch the wrapped instruction and re-apply the LOCK prefix.
    LOCK i               -> LOCK (x86_patchRegsOfInstr i env)
    XADD fmt src dst     -> patch2 (XADD fmt) src dst
    CMPXCHG fmt src dst  -> patch2 (CMPXCHG fmt) src dst
    MFENCE               -> instr

    _other              -> panic "patchRegs: unrecognised instr"

  where
    -- Apply a one-operand constructor to the (strictly) patched operand.
    patch1 :: (Operand -> a) -> Operand -> a
    patch1 insn op      = insn $! patchOp op
    -- Apply a two-operand constructor to the (strictly) patched operands.
    patch2 :: (Operand -> Operand -> a) -> Operand -> Operand -> a
    patch2 insn src dst = (insn $! patchOp src) $! patchOp dst

    -- Rename the registers mentioned by an operand; immediates are unchanged.
    patchOp (OpReg  reg) = OpReg $! env reg
    patchOp (OpImm  imm) = OpImm imm
    patchOp (OpAddr ea)  = OpAddr $! lookupAddr ea

    -- Rename the base and index registers of an addressing mode.
    lookupAddr (ImmAddr imm off) = ImmAddr imm off
    lookupAddr (AddrBaseIndex base index disp)
      = ((AddrBaseIndex $! lookupBase base) $! lookupIndex index) disp
      where
        lookupBase EABaseNone       = EABaseNone
        lookupBase EABaseRip        = EABaseRip
        lookupBase (EABaseReg r)    = EABaseReg $! env r

        lookupIndex EAIndexNone     = EAIndexNone
        lookupIndex (EAIndex r i)   = (EAIndex $! env r) i
614
615
616--------------------------------------------------------------------------------
-- | Is this a control-transfer instruction?  True for 'JMP', 'JXX',
-- 'JXX_GBL', 'JMP_TBL' and 'CALL'; False for everything else.
x86_isJumpishInstr :: Instr -> Bool
x86_isJumpishInstr JMP{}     = True
x86_isJumpishInstr JXX{}     = True
x86_isJumpishInstr JXX_GBL{} = True
x86_isJumpishInstr JMP_TBL{} = True
x86_isJumpishInstr CALL{}    = True
x86_isJumpishInstr _         = False
628
629
-- | The local blocks an instruction may jump to: the target of a 'JXX',
-- or every 'DestBlockId' entry of a jump table.  All other instructions
-- (and all other kinds of table entry) contribute nothing.
x86_jumpDestsOfInstr :: Instr -> [BlockId]
x86_jumpDestsOfInstr (JXX _ target)            = [target]
x86_jumpDestsOfInstr (JMP_TBL _ entries _ _)   = [blk | Just (DestBlockId blk) <- entries]
x86_jumpDestsOfInstr _                         = []
639
640
-- | Rewrite the block targets of a jump with the supplied function.
-- Affects the target of a 'JXX' and the 'DestBlockId' entries of a
-- 'JMP_TBL'; any other instruction is returned unchanged.
x86_patchJumpInstr :: Instr -> (BlockId -> BlockId) -> Instr
x86_patchJumpInstr insn patchF
  = case insn of
      JXX cond target ->
          JXX cond (patchF target)
      JMP_TBL addr entries sect tblLabel ->
          JMP_TBL addr [ fmap retarget entry | entry <- entries ] sect tblLabel
      _ ->
          insn
  where
    -- Only block-id destinations are rewritten; other destinations pass through.
    retarget (DestBlockId blk) = DestBlockId (patchF blk)
    retarget other             = other
653
654
655
656
657
-- -----------------------------------------------------------------------------
-- | Build the instruction that stores a register into its spill slot.
-- Integer-class registers are stored at the architecture word format,
-- double-class registers as 'FF64'; any other register class panics.
x86_mkSpillInstr
    :: DynFlags
    -> Reg      -- register to spill
    -> Int      -- current stack delta
    -> Int      -- spill slot to use
    -> Instr

x86_mkSpillInstr dflags reg delta slot
  = case targetClassOfReg platform reg of
      RcInteger -> MOV (archWordFormat (target32Bit platform)) (OpReg reg) slotAddr
      RcDouble  -> MOV FF64 (OpReg reg) slotAddr
      _         -> panic "X86.mkSpillInstr: no match"
  where
    platform = targetPlatform dflags
    -- stack-pointer-relative address of the slot, corrected by the current delta
    slotAddr = OpAddr (spRel dflags (spillSlotToOffset platform slot - delta))
677
-- | Build the instruction that reloads a register from its spill slot.
-- Integer-class registers are loaded at the architecture word format,
-- double-class registers as 'FF64'; any other register class panics.
x86_mkLoadInstr
    :: DynFlags
    -> Reg      -- register to load
    -> Int      -- current stack delta
    -> Int      -- spill slot to use
    -> Instr

x86_mkLoadInstr dflags reg delta slot
  = case targetClassOfReg platform reg of
      RcInteger -> MOV (archWordFormat (target32Bit platform)) slotAddr (OpReg reg)
      RcDouble  -> MOV FF64 slotAddr (OpReg reg)
      _         -> panic "X86.x86_mkLoadInstr"
  where
    platform = targetPlatform dflags
    -- stack-pointer-relative address of the slot, corrected by the current delta
    slotAddr = OpAddr (spRel dflags (spillSlotToOffset platform slot - delta))
696
-- | Size in bytes of one spill slot: 12 on 32-bit targets, 8 otherwise.
spillSlotSize :: Platform -> Int
spillSlotSize platform
  | target32Bit platform = 12
  | otherwise            = 8
700
-- | Number of spill slots that fit in the reserved C stack area once the
-- first 64 bytes (the same offset 'spillSlotToOffset' starts from) are
-- set aside, minus one.
maxSpillSlots :: DynFlags -> Int
maxSpillSlots dflags = usableBytes `div` slotBytes - 1
  where
    usableBytes = rESERVED_C_STACK_BYTES dflags - 64
    slotBytes   = spillSlotSize (targetPlatform dflags)
--     = 0 -- useful for testing allocMoreStack
705
-- | Number of bytes that the stack pointer should be aligned to.
stackAlign :: Int
stackAlign = 16
709
-- | Convert a spill slot number to an unsigned *byte* offset: slots are
-- laid out contiguously, 'spillSlotSize' bytes each, starting 64 bytes in.
-- Whether that offset is applied above or below the C stack pointer is
-- decided per architecture by the caller.
spillSlotToOffset :: Platform -> Int -> Int
spillSlotToOffset platform slot = slot * spillSlotSize platform + 64
716
717--------------------------------------------------------------------------------
718
-- | If the instruction is a 'DELTA' pseudo-op, return the C stack delta
-- it records; otherwise return 'Nothing'.
x86_takeDeltaInstr :: Instr -> Maybe Int
x86_takeDeltaInstr (DELTA delta) = Just delta
x86_takeDeltaInstr _             = Nothing
728
729
-- | Is this one of the pseudo-ops rather than a real machine instruction?
-- True for 'COMMENT', 'LOCATION', 'LDATA', 'NEWBLOCK', 'UNWIND' and
-- 'DELTA'; False for everything else.
x86_isMetaInstr :: Instr -> Bool
x86_isMetaInstr COMMENT{}  = True
x86_isMetaInstr LOCATION{} = True
x86_isMetaInstr LDATA{}    = True
x86_isMetaInstr NEWBLOCK{} = True
x86_isMetaInstr UNWIND{}   = True
x86_isMetaInstr DELTA{}    = True
x86_isMetaInstr _          = False
743
744
745
-- | Make a reg-reg move instruction.
--
-- Integer-class registers are moved at the native word width ('II32' on
-- 'ArchX86', 'II64' on 'ArchX86_64'; any other architecture panics).
-- Double-class registers are moved as 'FF64'.  Any other register class
-- panics.
--
-- TODO (inherited): the previous comment here described SPARC v8, which
-- has no direct moves between floating point and integer registers; it
-- is unclear why that remark was attached to the x86 code.
x86_mkRegRegMoveInstr
    :: Platform
    -> Reg
    -> Reg
    -> Instr

x86_mkRegRegMoveInstr platform src dst
  = case targetClassOfReg platform src of
      RcInteger ->
        case platformArch platform of
          ArchX86    -> move II32
          ArchX86_64 -> move II64
          _          -> panic "x86_mkRegRegMoveInstr: Bad arch"
      -- Using FF64 unconditionally is the lie we tell ourselves: float and
      -- double share one register class (the XMM registers) on x86_64 and
      -- on 32-bit x86 with SSE2.
      RcDouble -> move FF64
      _        -> panic "X86.RegInfo.mkRegRegMoveInstr: no match"
  where
    move fmt = MOV fmt (OpReg src) (OpReg dst)
769
-- | Recognise a reg-reg move: a 'MOV' whose two operands are both registers
--   yields @Just (source, destination)@, anything else 'Nothing'.
--
--   The register allocator uses this to try to eliminate reg->reg moves by
--   assigning the source and destination temporaries to the same real
--   register.
x86_takeRegRegMoveInstr :: Instr -> Maybe (Reg,Reg)
x86_takeRegRegMoveInstr instr
 = case instr of
        MOV _ (OpReg from) (OpReg to) -> Just (from, to)
        _                             -> Nothing
782
783
-- | Make an unconditional branch to the given block.  The result is a
--   single-instruction list containing @JXX ALWAYS@.
x86_mkJumpInstr :: BlockId -> [Instr]
x86_mkJumpInstr target = [JXX ALWAYS target]
791
792-- Note [Windows stack layout]
793-- | On most OSes the kernel will place a guard page after the current stack
794--   page.  If you allocate larger than a page worth you may jump over this
795--   guard page.  Not only is this a security issue, but on certain OSes such
796--   as Windows a new page won't be allocated if you don't hit the guard.  This
797--   will cause a segfault or access fault.
798--
799--   This function defines if the current allocation amount requires a probe.
800--   On Windows (for now) we emit a call to _chkstk for this.  For other OSes
801--   this is not yet implemented.
802--   See https://docs.microsoft.com/en-us/windows/desktop/DevNotes/-win32-chkstk
803--   The Windows stack looks like this:
804--
805--                         +-------------------+
806--                         |        SP         |
807--                         +-------------------+
808--                         |                   |
809--                         |    GUARD PAGE     |
810--                         |                   |
811--                         +-------------------+
812--                         |                   |
813--                         |                   |
814--                         |     UNMAPPED      |
815--                         |                   |
816--                         |                   |
817--                         +-------------------+
818--
--   In essence, each allocation larger than a page needs to be chunked and
--   a probe emitted after each page allocation.  You have to hit the guard
--   page so the kernel can map in the next page, otherwise you'll segfault.
822--   See Note [Windows stack allocations].
823--
needs_probe_call :: Platform -> Int -> Bool
needs_probe_call platform amount
  | OSMinGW32 <- platformOS platform = probeOnArch (platformArch platform)
  | otherwise                        = False   -- only Windows probes for now
  where
    -- Allocations larger than this many bytes (4 KiB) need a probe.
    probeThreshold = 4 * 1024

    -- Only the two x86 flavours ever ask for a probe.
    probeOnArch ArchX86    = amount > probeThreshold
    probeOnArch ArchX86_64 = amount > probeThreshold
    probeOnArch _          = False
832
-- | Instructions that grow the C stack by @amount@ bytes.
--
--   On every OS except Windows this is a single @SUB@ of the amount from the
--   stack pointer (esp on x86, rsp on x86_64).
--
--   On Windows ('OSMinGW32') there are two shapes:
--
--   * when 'needs_probe_call' says a probe is required, the amount is loaded
--     into the accumulator (eax/rax), @___chkstk_ms@ is called, and the
--     accumulator is then subtracted from the stack pointer;
--
--   * otherwise the amount is subtracted directly, followed by a @TEST@ of
--     the stack pointer against itself.
--
--   The probing sequence clobbers AX.  That is considered acceptable because:
--
--   1. It is the first thing done when entering the closure, and AX is a
--      caller-saved register on Windows on both x86_64 and x86.
--
--   2. Closures are only entered via a call or longjmp, in which case there
--      are no expectations for volatile registers.
--
--   3. When the target is a local branch point it is re-targeted after the
--      dealloc, preserving #2.  See Note [extra spill slots].
--
--   A call is emitted rather than inline probes because the probes are quite
--   involved and would bloat code size a lot (GHC doesn't really have an -Os).
--   The probe routine is guaranteed to leave all nonvolatile registers and AX
--   untouched; it is part of the standard prologue of any Windows function
--   that drops the stack by more than a page.
--   See Note [Windows stack layout].
--
--   Panics on any architecture other than x86 / x86_64.
x86_mkStackAllocInstr :: Platform -> Int -> [Instr]
x86_mkStackAllocInstr platform amount
  = case platformArch platform of
      ArchX86    -> allocUsing II32 eax esp
      ArchX86_64 -> allocUsing II64 rax rsp
      _          -> panic "x86_mkStackAllocInstr"
  where
    -- Plain "sub $amount, sp" at the given operand format.
    bump fmt sp = SUB fmt (OpImm (ImmInt amount)) (OpReg sp)

    -- OS-specific sequence, parameterised by the word format, the
    -- accumulator register and the stack pointer register of the arch.
    allocUsing fmt acc sp = case platformOS platform of
      OSMinGW32
        | needs_probe_call platform amount ->
            [ MOV fmt (OpImm (ImmInt amount)) (OpReg acc)
            , CALL (Left $ strImmLit "___chkstk_ms") [acc]
            , SUB fmt (OpReg acc) (OpReg sp)
            ]
        | otherwise ->
            -- NOTE(review): the reason for the trailing TEST is not evident
            -- from this code; confirm before changing it.
            [ bump fmt sp
            , TEST fmt (OpReg sp) (OpReg sp)
            ]
      _ -> [ bump fmt sp ]
882
-- | Instructions that shrink the C stack by @amount@ bytes again: a single
--   @ADD@ of the amount to the stack pointer (esp on x86, rsp on x86_64).
--   Panics on any other architecture.
x86_mkStackDeallocInstr :: Platform -> Int -> [Instr]
x86_mkStackDeallocInstr platform amount
  = case platformArch platform of
      ArchX86    -> release II32 esp
      ArchX86_64 -> release II64 rsp
      _          -> panic "x86_mkStackDeallocInstr"
  where
    release fmt sp = [ADD fmt (OpImm (ImmInt amount)) (OpReg sp)]
892
893
894--
895-- Note [extra spill slots]
896--
897-- If the register allocator used more spill slots than we have
898-- pre-allocated (rESERVED_C_STACK_BYTES), then we must allocate more
899-- C stack space on entry and exit from this proc.  Therefore we
900-- insert a "sub $N, %rsp" at every entry point, and an "add $N, %rsp"
901-- before every non-local jump.
902--
903-- This became necessary when the new codegen started bundling entire
904-- functions together into one proc, because the register allocator
905-- assigns a different stack slot to each virtual reg within a proc.
906-- To avoid using so many slots we could also:
907--
908--   - split up the proc into connected components before code generator
909--
910--   - rename the virtual regs, so that we re-use vreg names and hence
911--     stack slots for non-overlapping vregs.
912--
913-- Note that when a block is both a non-local entry point (with an
914-- info table) and a local branch target, we have to split it into
915-- two, like so:
916--
917--    <info table>
918--    L:
919--       <code>
920--
921-- becomes
922--
923--    <info table>
924--    L:
--       sub $N, %rsp
926--       jmp Lnew
927--    Lnew:
928--       <code>
929--
-- and all branches pointing to L are retargeted to point to Lnew.
-- Otherwise, we would repeat the $rsp adjustment for each branch to
-- L.
933--
934-- Returns a list of (L,Lnew) pairs.
935--
-- | Make room on the C stack for @slots@ extra spill slots in one top-level
--   declaration.
--
--   Data declarations are returned untouched.  For a proc:
--
--   * every entry block @L@ is split in two: @L@ now only allocates the extra
--     stack and jumps to a freshly named block @Lnew@ holding the original
--     code;
--
--   * a matching deallocation is inserted in front of every 'JMP';
--
--   * every other jump is patched so that branches to @L@ go to @Lnew@.
--
--   The second component of the result lists the @(L, Lnew)@ pairs.
--   Encountering a 'JXX_GBL' instruction is a panic.
allocMoreStack
  :: Platform
  -> Int
  -> NatCmmDecl statics X86.Instr.Instr
  -> UniqSM (NatCmmDecl statics X86.Instr.Instr, [(BlockId,BlockId)])

allocMoreStack _ _ decl@(CmmData _ _) = return (decl, [])
allocMoreStack platform slots decl@(CmmProc info lbl live (ListGraph blocks)) = do
    let entryIds = entryBlocks decl

    -- one fresh unique per entry block, used to name its "Lnew" twin
    freshUniqs <- replicateM (length entryIds) getUniqueM

    let
      -- bytes needed for the extra slots, rounded up to a multiple of
      -- stackAlign
      wanted = slots * spillSlotSize platform
      delta  = ((wanted + stackAlign - 1) `quot` stackAlign) * stackAlign

      alloc   = mkStackAllocInstr   platform delta
      dealloc = mkStackDeallocInstr platform delta

      retargetList = [ (old, mkBlockId u) | (old, u) <- zip entryIds freshUniqs ]

      redirects :: LabelMap BlockId
      redirects = mapFromList retargetList

      -- send a branch target to its replacement block, if it has one
      retarget b = fromMaybe b (mapLookup b redirects)

      -- rewrite a single instruction, possibly expanding it
      fixInstr insn = case insn of
         JMP _ _     -> dealloc ++ [insn]
         JXX_GBL _ _ -> panic "insert_dealloc: cannot handle JXX_GBL"
         _other      -> [x86_patchJumpInstr insn retarget]

      -- entry blocks become an allocating stub plus the renamed body
      splitBlock (BasicBlock bid insns) =
         case mapLookup bid redirects of
           Just newId -> [ BasicBlock bid (alloc ++ [JXX ALWAYS newId])
                         , BasicBlock newId body ]
           Nothing    -> [ BasicBlock bid body ]
         where body = concatMap fixInstr insns

      rewritten = concatMap splitBlock blocks

    return (CmmProc info lbl live (ListGraph rewritten), retargetList)
978
-- | The destination of a jump, as produced by 'canShortcut'.
data JumpDest
  = DestBlockId BlockId  -- ^ the jump goes to a basic block
  | DestImm Imm          -- ^ the jump goes to an immediate operand
980
-- Debug-only pretty printer: block destinations show their id, immediate
-- destinations are printed as a fixed placeholder.
instance Outputable JumpDest where
  ppr dest = case dest of
    DestBlockId bid -> text "jd<blk>:" <> ppr bid
    DestImm _       -> text "jd<imm>:noShow"
985
986
-- | The 'BlockId' of a jump destination, if it is a block at all.
getJumpDestBlockId :: JumpDest -> Maybe BlockId
getJumpDestBlockId dest = case dest of
  DestBlockId bid -> Just bid
  _               -> Nothing
990
-- | If the instruction is an unconditional jump to a block (@JXX ALWAYS@) or
--   a 'JMP' to an immediate operand, return where it goes; otherwise
--   'Nothing'.
canShortcut :: Instr -> Maybe JumpDest
canShortcut instr = case instr of
  JXX ALWAYS bid    -> Just (DestBlockId bid)
  JMP (OpImm imm) _ -> Just (DestImm imm)
  _                 -> Nothing
995
996
-- | Shortcut a sequence of branches.
--
--   @fn@ maps a block to the place it immediately jumps on to, if any.
--
--   * A 'JXX' is followed through @fn@ repeatedly; the set of blocks already
--     visited stops us from chasing a cycle forever.  Reaching an immediate
--     destination turns the jump into a 'JXX_GBL'.
--
--   * In a 'JMP_TBL' every block entry is replaced by what @fn@ gives for it
--     (a single step, not a chain); other entries are kept.
--
--   * Any other instruction is returned unchanged.
shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
shortcutJump fn insn = follow (setEmpty :: LabelSet) insn
  where
    follow :: LabelSet -> Instr -> Instr
    follow visited jump@(JXX cc target)
      | setMember target visited = jump
      | otherwise = case fn target of
          Nothing                 -> jump
          Just (DestBlockId next) -> follow visited' (JXX cc next)
          Just (DestImm imm)      -> follow visited' (JXX_GBL cc imm)
      where visited' = setInsert target visited
    follow _ (JMP_TBL addr entries section tblId) =
      JMP_TBL addr (map redirect entries) section tblId
      where
        redirect entry@(Just (DestBlockId bid)) =
          case fn bid of
            Nothing -> entry
            found   -> found
        redirect entry = entry
    follow _ other = other
1019
-- | Apply the branch-shortcutting map to every entry of a static data block.
--   Jump tables are emitted as data, so their entries have to be redirected
--   as well.  Lives in this module because it needs to know about 'JumpDest'.
shortcutStatics :: (BlockId -> Maybe JumpDest) -> (Alignment, CmmStatics) -> (Alignment, CmmStatics)
shortcutStatics fn (align, Statics lbl statics)
  = (align, Statics lbl shortened)
  where
    shortened = [ shortcutStatic fn static | static <- statics ]
1026
-- | Shortcut a label: labels of local blocks are chased through @fn@ (see
--   'shortBlockId'); every other label is returned as is.
shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel
shortcutLabel fn lab = case maybeLocalBlockLabel lab of
  Just blkId -> shortBlockId fn emptyUniqSet blkId
  Nothing    -> lab
1031
-- | Shortcut the label inside a single static.  Only label literals and
--   label-difference literals are affected; other statics pass through.
shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic
shortcutStatic fn static = case static of
  CmmStaticLit (CmmLabel lab) ->
    CmmStaticLit (CmmLabel (shortcutLabel fn lab))
  -- Slightly dodgy: only the first label is shortcut and the second one is
  -- ignored, but this works with the way CmmLabelDiffOff is used for jump
  -- tables now.
  CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w) ->
    CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w)
  _ -> static
1041
-- | Chase a block through the shortcut map @fn@ and return the label it
--   finally resolves to.
--
--   The chase stops, returning the label of the current block, when the block
--   has been visited before (@seen@ guards against cycles) or when @fn@ has
--   no entry for it.  An immediate destination is only accepted when it is a
--   C label ('ImmCLbl'); any other immediate is a panic.
shortBlockId
        :: (BlockId -> Maybe JumpDest)
        -> UniqSet Unique
        -> BlockId
        -> CLabel

shortBlockId fn seen blockid
  | elementOfUniqSet uq seen = blockLbl blockid
  | otherwise = case fn blockid of
      Nothing                      -> blockLbl blockid
      Just (DestBlockId next)      -> shortBlockId fn (addOneToUniqSet seen uq) next
      Just (DestImm (ImmCLbl lbl)) -> lbl
      Just _                       -> panic "shortBlockId"
  where uq = getUnique blockid
1056