{-# LANGUAGE CPP, TypeFamilies #-}

-----------------------------------------------------------------------------
--
-- Machine-dependent assembly language
--
-- (c) The University of Glasgow 1993-2004
--
-----------------------------------------------------------------------------

module X86.Instr (Instr(..), Operand(..), PrefetchVariant(..), JumpDest(..),
                  getJumpDestBlockId, canShortcut, shortcutStatics,
                  shortcutJump, allocMoreStack,
                  maxSpillSlots, archWordFormat )
where

#include "HsVersions.h"

import GhcPrelude

import X86.Cond
import X86.Regs
import Instruction
import Format
import RegClass
import Reg
import TargetReg

import BlockId
import Hoopl.Collections
import Hoopl.Label
import GHC.Platform.Regs
import Cmm
import FastString
import Outputable
import GHC.Platform

import BasicTypes       (Alignment)
import CLabel
import DynFlags
import UniqSet
import Unique
import UniqSupply
import Debug            (UnwindTable)

import Control.Monad
import Data.Maybe       (fromMaybe)

-- | Format of an x86/x86_64 machine word: 'II32' when targeting
-- 32-bit x86, 'II64' when targeting x86_64.
archWordFormat :: Bool -> Format
archWordFormat is32Bit = if is32Bit then II32 else II64

-- | Instruction instance for x86 instruction set.
instance Instruction Instr where
        regUsageOfInstr         = x86_regUsageOfInstr
        patchRegsOfInstr        = x86_patchRegsOfInstr
        isJumpishInstr          = x86_isJumpishInstr
        jumpDestsOfInstr        = x86_jumpDestsOfInstr
        patchJumpInstr          = x86_patchJumpInstr
        mkSpillInstr            = x86_mkSpillInstr
        mkLoadInstr             = x86_mkLoadInstr
        takeDeltaInstr          = x86_takeDeltaInstr
        isMetaInstr             = x86_isMetaInstr
        mkRegRegMoveInstr       = x86_mkRegRegMoveInstr
        takeRegRegMoveInstr     = x86_takeRegRegMoveInstr
        mkJumpInstr             = x86_mkJumpInstr
        mkStackAllocInstr       = x86_mkStackAllocInstr
        mkStackDeallocInstr     = x86_mkStackDeallocInstr


-- -----------------------------------------------------------------------------
-- Intel x86 instructions

{-
Intel, in their infinite wisdom, selected a stack model for floating
point registers on x86.  That might have made sense back in 1979 --
nowadays we can see it for the nonsense it really is.  A stack model
fits poorly with the existing nativeGen infrastructure, which assumes
flat integer and FP register sets.  Prior to this commit, nativeGen
could not generate correct x86 FP code -- to do so would have meant
somehow working the register-stack paradigm into the register
allocator and spiller, which sounds very difficult.

We have decided to cheat, and go for a simple fix which requires no
infrastructure modifications, at the expense of generating ropey but
correct FP code.  All notions of the x86 FP stack and its insns have
been removed.  Instead, we pretend (to the instruction selector and
register allocator) that x86 has six floating point registers, %fake0
.. %fake5, which can be used in the usual flat manner.  We further
claim that x86 has floating point instructions very similar to SPARC
and Alpha, that is, a simple 3-operand register-register arrangement.
Code generation and register allocation proceed on this basis.

When we come to print out the final assembly, our convenient fiction
is converted to dismal reality.  Each fake instruction is
independently converted to a series of real x86 instructions.
%fake0 .. %fake5 are mapped to %st(0) .. %st(5).  To do reg-reg
arithmetic operations, the two operands are pushed onto the top of the
FP stack, the operation done, and the result copied back into the
relevant register.  There are only six %fake registers because 2 are
needed for the translation, and x86 has 8 in total.

The translation is inefficient but is simple and it works.  A cleverer
translation would handle a sequence of insns, simulating the FP stack
contents, would not impose a fixed mapping from %fake to %st regs, and
hopefully could avoid most of the redundant reg-reg moves of the
current translation.

We might as well make use of whatever unique FP facilities Intel have
chosen to bless us with (let's not be churlish, after all).
Hence GLDZ and GLD1.  Bwahahahahahahaha!
-}

{-
Note [x86 Floating point precision]

Intel's internal floating point registers are by default 80 bit
extended precision.  This means that all operations done on values in
registers are done at 80 bits, and unless the intermediate values are
truncated to the appropriate size (32 or 64 bits) by storing in
memory, calculations in registers will give different results from
calculations which pass intermediate values in memory (eg. via
function calls).

One solution is to set the FPU into 64 bit precision mode.  Some OSs
do this (eg. FreeBSD) and some don't (eg. Linux).  The problem here is
that this will only affect 64-bit precision arithmetic; 32-bit
calculations will still be done at 64-bit precision in registers.  So
it doesn't solve the whole problem.

There's also the issue of what the C library is expecting in terms of
precision.  It seems to be the case that glibc on Linux expects the
FPU to be set to 80 bit precision, so setting it to 64 bit could have
unexpected effects.  Changing the default could have undesirable
effects on other 3rd-party library code too, so the right thing would
be to save/restore the FPU control word across Haskell code if we were
to do this.

gcc's -ffloat-store gives consistent results by always storing the
results of floating-point calculations in memory, which works for both
32 and 64-bit precision.  However, it only affects the values of
user-declared floating point variables in C, not intermediate results.
GHC in -fvia-C mode uses -ffloat-store (see the -fexcess-precision
flag).

Another problem is how to spill floating point registers in the
register allocator.  Should we spill the whole 80 bits, or just 64?
On an OS which is set to 64 bit precision, spilling 64 is fine.  On
Linux, spilling 64 bits will round the results of some operations.
This is what gcc does.  Spilling at 80 bits requires taking up a full
128 bit slot (so we get alignment).  We spill at 80-bits and ignore
the alignment problems.

In the future [edit: now available in GHC 7.0.1, with the -msse2
flag], we'll use the SSE registers for floating point.  This requires
a CPU that supports SSE2 (ordinary SSE only supports 32 bit precision
float ops), which means P4 or Xeon and above.  Using SSE will solve
all these problems, because the SSE registers use fixed 32 bit or 64
bit precision.

--SDM 1/2003
-}

data Instr
        -- comment pseudo-op
        = COMMENT FastString

        -- location pseudo-op (file, line, col, name)
        | LOCATION Int Int Int String

        -- some static data spat out during code
        -- generation.  Will be extracted before
        -- pretty-printing.
        | LDATA   Section (Alignment, CmmStatics)

        -- start a new basic block.  Useful during
        -- codegen, removed later.  Preceding
        -- instruction should be a jump, as per the
        -- invariants for a BasicBlock (see Cmm).
        | NEWBLOCK BlockId

        -- unwinding information
        -- See Note [Unwinding information in the NCG].
        | UNWIND CLabel UnwindTable

        -- specify current stack offset for benefit of subsequent passes.
        -- NOTE(review): carries only the Int byte offset; block identity
        -- for unwinding purposes is conveyed separately via UNWIND.
        | DELTA  Int

        -- Moves.
        | MOV         Format Operand Operand
        | CMOV   Cond Format Operand Reg
        | MOVZxL      Format Operand Operand -- format is the size of operand 1
        | MOVSxL      Format Operand Operand -- format is the size of operand 1
        -- x86_64 note: plain mov into a 32-bit register always zero-extends
        -- into the 64-bit reg, in contrast to the 8 and 16-bit movs which
        -- don't affect the high bits of the register.

        -- Load effective address (also a very useful three-operand add instruction :-)
        | LEA         Format Operand Operand

        -- Int Arithmetic.
        | ADD         Format Operand Operand
        | ADC         Format Operand Operand
        | SUB         Format Operand Operand
        | SBB         Format Operand Operand

        | MUL         Format Operand Operand
        | MUL2        Format Operand         -- %edx:%eax = operand * %rax
        | IMUL        Format Operand Operand -- signed int mul
        | IMUL2       Format Operand         -- %edx:%eax = operand * %eax

        | DIV         Format Operand         -- eax := eax:edx/op, edx := eax:edx%op
        | IDIV        Format Operand         -- ditto, but signed

        -- Int Arithmetic, where the effects on the condition register
        -- are important. Used in specialized sequences such as MO_Add2.
        -- Do not rewrite these instructions to "equivalent" ones that
        -- have different effect on the condition register! (See #9013.)
        | ADD_CC      Format Operand Operand
        | SUB_CC      Format Operand Operand

        -- Simple bit-twiddling.
        | AND         Format Operand Operand
        | OR          Format Operand Operand
        | XOR         Format Operand Operand
        | NOT         Format Operand
        | NEGI        Format Operand         -- NEG instruction (name clash with Cond)
        | BSWAP       Format Reg

        -- Shifts (amount may be immediate or %cl only)
        | SHL         Format Operand{-amount-} Operand
        | SAR         Format Operand{-amount-} Operand
        | SHR         Format Operand{-amount-} Operand

        | BT          Format Imm Operand
        | NOP


        -- We need to support the FSTP (x87 store and pop) instruction
        -- so that we can correctly read off the return value of an
        -- x86 CDECL C function call when it's floating point.
        -- So we don't include a register argument, and just use st(0).
        -- This instruction is used ONLY for return values of C ffi calls
        -- in x86_32 abi.
        | X87Store         Format  AddrMode -- st(0), dst


        -- SSE2 floating point: we use a restricted set of the available SSE2
        -- instructions for floating-point.
        -- use MOV for moving (either movss or movsd (movlpd better?))
        | CVTSS2SD      Reg Reg            -- F32 to F64
        | CVTSD2SS      Reg Reg            -- F64 to F32
        | CVTTSS2SIQ    Format Operand Reg -- F32 to I32/I64 (with truncation)
        | CVTTSD2SIQ    Format Operand Reg -- F64 to I32/I64 (with truncation)
        | CVTSI2SS      Format Operand Reg -- I32/I64 to F32
        | CVTSI2SD      Format Operand Reg -- I32/I64 to F64

        -- use ADD, SUB, and SQRT for arithmetic.  In both cases, operands
        -- are  Operand Reg.

        -- SSE2 floating-point division:
        | FDIV          Format Operand Operand -- divisor, dividend(dst)

        -- use CMP for comparisons.  ucomiss and ucomisd instructions
        -- compare single/double prec floating point respectively.

        | SQRT          Format Operand Reg -- src, dst


        -- Comparison
        | TEST          Format Operand Operand
        | CMP           Format Operand Operand
        | SETCC         Cond Operand

        -- Stack Operations.
        | PUSH          Format Operand
        | POP           Format Operand
        -- both unused (SDM):
        --  | PUSHA
        --  | POPA

        -- Jumping around.
        | JMP         Operand [Reg]        -- including live Regs at the call
        | JXX         Cond BlockId         -- includes unconditional branches
        | JXX_GBL     Cond Imm             -- non-local version of JXX
        -- Table jump
        | JMP_TBL     Operand              -- Address to jump to
                      [Maybe JumpDest]     -- Targets of the jump table
                      Section              -- Data section jump table should be put in
                      CLabel               -- Label of jump table
        -- | X86 call instruction
        | CALL        (Either Imm Reg)     -- ^ Jump target
                      [Reg]                -- ^ Arguments (required for register allocation)

        -- Other things.
        | CLTD Format            -- sign extend %eax into %edx:%eax

        | FETCHGOT    Reg        -- pseudo-insn for ELF position-independent code
                                 -- pretty-prints as
                                 --       call 1f
                                 -- 1:    popl %reg
                                 --       addl __GLOBAL_OFFSET_TABLE__+.-1b, %reg
        | FETCHPC     Reg        -- pseudo-insn for Darwin position-independent code
                                 -- pretty-prints as
                                 --       call 1f
                                 -- 1:    popl %reg

        -- bit counting instructions
        | POPCNT      Format Operand Reg -- [SSE4.2] count number of bits set to 1
        | LZCNT       Format Operand Reg -- [BMI2] count number of leading zeros
        | TZCNT       Format Operand Reg -- [BMI2] count number of trailing zeros
        | BSF         Format Operand Reg -- bit scan forward
        | BSR         Format Operand Reg -- bit scan reverse

        -- bit manipulation instructions
        | PDEP        Format Operand Operand Reg -- [BMI2] deposit bits to   the specified mask
        | PEXT        Format Operand Operand Reg -- [BMI2] extract bits from the specified mask

        -- prefetch
        | PREFETCH  PrefetchVariant Format Operand -- prefetch Variant, addr size, address to prefetch
                                        -- variant can be NTA, Lvl0, Lvl1, or Lvl2

        | LOCK      Instr -- lock prefix
        | XADD      Format Operand Operand -- src (r), dst (r/m)
        | CMPXCHG   Format Operand Operand -- src (r), dst (r/m), eax implicit
        | MFENCE

data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2


data Operand
        = OpReg  Reg            -- register
        | OpImm  Imm            -- immediate value
        | OpAddr AddrMode       -- memory reference



-- | Returns which registers are read and written as a (read, written)
-- pair.
x86_regUsageOfInstr :: Platform -> Instr -> RegUsage
x86_regUsageOfInstr platform instr
 = case instr of
    MOV    _ src dst    -> usageRW src dst
    CMOV _ _ src dst    -> mkRU (use_R src [dst]) [dst]
    MOVZxL _ src dst    -> usageRW src dst
    MOVSxL _ src dst    -> usageRW src dst
    LEA    _ src dst    -> usageRW src dst
    ADD    _ src dst    -> usageRM src dst
    ADC    _ src dst    -> usageRM src dst
    SUB    _ src dst    -> usageRM src dst
    SBB    _ src dst    -> usageRM src dst
    IMUL   _ src dst    -> usageRM src dst

    -- Result of IMULB will be just in %ax
    IMUL2  II8 src      -> mkRU (eax:use_R src []) [eax]
    -- Result of IMUL for wider values, will be split between %dx/%edx/%rdx and
    -- %ax/%eax/%rax.
    IMUL2  _ src        -> mkRU (eax:use_R src []) [eax,edx]

    MUL    _ src dst    -> usageRM src dst
    MUL2   _ src        -> mkRU (eax:use_R src []) [eax,edx]
    DIV    _ op         -> mkRU (eax:edx:use_R op []) [eax,edx]
    IDIV   _ op         -> mkRU (eax:edx:use_R op []) [eax,edx]
    ADD_CC _ src dst    -> usageRM src dst
    SUB_CC _ src dst    -> usageRM src dst
    AND    _ src dst    -> usageRM src dst
    OR     _ src dst    -> usageRM src dst

    -- xorq r,r is the idiomatic way to zero r: it reads nothing useful,
    -- so treat the register as written-only.
    XOR    _ (OpReg src) (OpReg dst)
        | src == dst    -> mkRU [] [dst]

    XOR    _ src dst    -> usageRM src dst
    NOT    _ op         -> usageM op
    BSWAP  _ reg        -> mkRU [reg] [reg]
    NEGI   _ op         -> usageM op
    SHL    _ imm dst    -> usageRM imm dst
    SAR    _ imm dst    -> usageRM imm dst
    SHR    _ imm dst    -> usageRM imm dst
    BT     _ _   src    -> mkRUR (use_R src [])

    PUSH   _ op         -> mkRUR (use_R op [])
    POP    _ op         -> mkRU [] (def_W op)
    TEST   _ src dst    -> mkRUR (use_R src $! use_R dst [])
    CMP    _ src dst    -> mkRUR (use_R src $! use_R dst [])
    SETCC  _ op         -> mkRU [] (def_W op)
    JXX    _ _          -> mkRU [] []
    JXX_GBL _ _         -> mkRU [] []
    JMP     op regs     -> mkRUR (use_R op regs)
    JMP_TBL op _ _ _    -> mkRUR (use_R op [])
    CALL (Left _)  params   -> mkRU params (callClobberedRegs platform)
    CALL (Right reg) params -> mkRU (reg:params) (callClobberedRegs platform)
    CLTD   _            -> mkRU [eax] [edx]
    NOP                 -> mkRU [] []

    X87Store    _  dst  -> mkRUR ( use_EA dst [])

    CVTSS2SD   src dst  -> mkRU [src] [dst]
    CVTSD2SS   src dst  -> mkRU [src] [dst]
    CVTTSS2SIQ _ src dst -> mkRU (use_R src []) [dst]
    CVTTSD2SIQ _ src dst -> mkRU (use_R src []) [dst]
    CVTSI2SS   _ src dst -> mkRU (use_R src []) [dst]
    CVTSI2SD   _ src dst -> mkRU (use_R src []) [dst]
    FDIV _     src dst  -> usageRM src dst
    SQRT _ src dst      -> mkRU (use_R src []) [dst]

    FETCHGOT reg        -> mkRU [] [reg]
    FETCHPC  reg        -> mkRU [] [reg]

    COMMENT _           -> noUsage
    LOCATION{}          -> noUsage
    UNWIND{}            -> noUsage
    DELTA   _           -> noUsage

    POPCNT _ src dst -> mkRU (use_R src []) [dst]
    LZCNT  _ src dst -> mkRU (use_R src []) [dst]
    TZCNT  _ src dst -> mkRU (use_R src []) [dst]
    BSF    _ src dst -> mkRU (use_R src []) [dst]
    BSR    _ src dst -> mkRU (use_R src []) [dst]

    PDEP   _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
    PEXT   _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]

    -- note: might be a better way to do this
    PREFETCH _  _ src -> mkRU (use_R src []) []
    LOCK i              -> x86_regUsageOfInstr platform i
    XADD _ src dst      -> usageMM src dst
    CMPXCHG _ src dst   -> usageRMM src dst (OpReg eax)
    MFENCE -> noUsage

    _other              -> panic "regUsage: unrecognised instr"
 where
    -- # Definitions
    --
    -- Written: If the operand is a register, it's written. If it's an
    -- address, registers mentioned in the address are read.
    --
    -- Modified: If the operand is a register, it's both read and
    -- written. If it's an address, registers mentioned in the address
    -- are read.

    -- 2 operand form; first operand Read; second Written
    usageRW :: Operand -> Operand -> RegUsage
    usageRW op (OpReg reg)      = mkRU (use_R op []) [reg]
    usageRW op (OpAddr ea)      = mkRUR (use_R op $! use_EA ea [])
    usageRW _ _                 = panic "X86.RegInfo.usageRW: no match"

    -- 2 operand form; first operand Read; second Modified
    usageRM :: Operand -> Operand -> RegUsage
    usageRM op (OpReg reg)      = mkRU (use_R op [reg]) [reg]
    usageRM op (OpAddr ea)      = mkRUR (use_R op $! use_EA ea [])
    usageRM _ _                 = panic "X86.RegInfo.usageRM: no match"

    -- 2 operand form; first operand Modified; second Modified
    usageMM :: Operand -> Operand -> RegUsage
    usageMM (OpReg src) (OpReg dst) = mkRU [src, dst] [src, dst]
    usageMM (OpReg src) (OpAddr ea) = mkRU (use_EA ea [src]) [src]
    usageMM _ _                     = panic "X86.RegInfo.usageMM: no match"

    -- 3 operand form; first operand Read; second Modified; third Modified
    usageRMM :: Operand -> Operand -> Operand -> RegUsage
    usageRMM (OpReg src) (OpReg dst) (OpReg reg) = mkRU [src, dst, reg] [dst, reg]
    usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg]
    usageRMM _ _ _                               = panic "X86.RegInfo.usageRMM: no match"

    -- 1 operand form; operand Modified
    usageM :: Operand -> RegUsage
    usageM (OpReg reg)          = mkRU [reg] [reg]
    usageM (OpAddr ea)          = mkRUR (use_EA ea [])
    usageM _                    = panic "X86.RegInfo.usageM: no match"

    -- Registers defd when an operand is written.
    def_W (OpReg reg)           = [reg]
    def_W (OpAddr _ )           = []
    def_W _                     = panic "X86.RegInfo.def_W: no match"

    -- Registers used when an operand is read.
    use_R (OpReg reg)  tl = reg : tl
    use_R (OpImm _)    tl = tl
    use_R (OpAddr ea)  tl = use_EA ea tl

    -- Registers used to compute an effective address.
    use_EA (ImmAddr _ _) tl = tl
    use_EA (AddrBaseIndex base index _) tl =
        use_base base $! use_index index tl
        where use_base (EABaseReg r)  tl = r : tl
              use_base _              tl = tl
              use_index EAIndexNone   tl = tl
              use_index (EAIndex i _) tl = i : tl

    mkRUR src = src' `seq` RU src' []
        where src' = filter (interesting platform) src

    mkRU src dst = src' `seq` dst' `seq` RU src' dst'
        where src' = filter (interesting platform) src
              dst' = filter (interesting platform) dst

-- | Is this register interesting for the register allocator?
interesting :: Platform -> Reg -> Bool
interesting _        (RegVirtual _)              = True
interesting platform (RegReal (RealRegSingle i)) = freeReg platform i
interesting _        (RegReal (RealRegPair{}))   = panic "X86.interesting: no reg pairs on this arch"



-- | Applies the supplied function to all registers in instructions.
-- Typically used to change virtual registers to real registers.
x86_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
x86_patchRegsOfInstr instr env
 = case instr of
    MOV  fmt src dst     -> patch2 (MOV  fmt) src dst
    CMOV cc fmt src dst  -> CMOV cc fmt (patchOp src) (env dst)
    MOVZxL fmt src dst   -> patch2 (MOVZxL fmt) src dst
    MOVSxL fmt src dst   -> patch2 (MOVSxL fmt) src dst
    LEA  fmt src dst     -> patch2 (LEA  fmt) src dst
    ADD  fmt src dst     -> patch2 (ADD  fmt) src dst
    ADC  fmt src dst     -> patch2 (ADC  fmt) src dst
    SUB  fmt src dst     -> patch2 (SUB  fmt) src dst
    SBB  fmt src dst     -> patch2 (SBB  fmt) src dst
    IMUL fmt src dst     -> patch2 (IMUL fmt) src dst
    IMUL2 fmt src        -> patch1 (IMUL2 fmt) src
    MUL fmt src dst      -> patch2 (MUL fmt) src dst
    MUL2 fmt src         -> patch1 (MUL2 fmt) src
    IDIV fmt op          -> patch1 (IDIV fmt) op
    DIV fmt op           -> patch1 (DIV fmt) op
    ADD_CC fmt src dst   -> patch2 (ADD_CC fmt) src dst
    SUB_CC fmt src dst   -> patch2 (SUB_CC fmt) src dst
    AND fmt src dst      -> patch2 (AND fmt) src dst
    OR  fmt src dst      -> patch2 (OR  fmt) src dst
    XOR fmt src dst      -> patch2 (XOR fmt) src dst
    NOT fmt op           -> patch1 (NOT fmt) op
    BSWAP fmt reg        -> BSWAP fmt (env reg)
    NEGI fmt op          -> patch1 (NEGI fmt) op
    SHL fmt imm dst      -> patch1 (SHL fmt imm) dst
    SAR fmt imm dst      -> patch1 (SAR fmt imm) dst
    SHR fmt imm dst      -> patch1 (SHR fmt imm) dst
    BT  fmt imm src      -> patch1 (BT  fmt imm) src
    TEST fmt src dst     -> patch2 (TEST fmt) src dst
    CMP  fmt src dst     -> patch2 (CMP  fmt) src dst
    PUSH fmt op          -> patch1 (PUSH fmt) op
    POP  fmt op          -> patch1 (POP  fmt) op
    SETCC cond op        -> patch1 (SETCC cond) op
    JMP op regs          -> JMP (patchOp op) regs
    JMP_TBL op ids s lbl -> JMP_TBL (patchOp op) ids s lbl

    -- literally only support storing the top x87 stack value st(0)
    X87Store  fmt  dst     -> X87Store fmt  (lookupAddr dst)

    CVTSS2SD src dst     -> CVTSS2SD (env src) (env dst)
    CVTSD2SS src dst     -> CVTSD2SS (env src) (env dst)
    CVTTSS2SIQ fmt src dst -> CVTTSS2SIQ fmt (patchOp src) (env dst)
    CVTTSD2SIQ fmt src dst -> CVTTSD2SIQ fmt (patchOp src) (env dst)
    CVTSI2SS fmt src dst -> CVTSI2SS fmt (patchOp src) (env dst)
    CVTSI2SD fmt src dst -> CVTSI2SD fmt (patchOp src) (env dst)
    FDIV fmt src dst     -> FDIV fmt (patchOp src) (patchOp dst)
    SQRT fmt src dst     -> SQRT fmt (patchOp src) (env dst)

    CALL (Left _)  _     -> instr
    CALL (Right reg) p   -> CALL (Right (env reg)) p

    FETCHGOT reg         -> FETCHGOT (env reg)
    FETCHPC  reg         -> FETCHPC  (env reg)

    NOP                  -> instr
    COMMENT _            -> instr
    LOCATION {}          -> instr
    UNWIND {}            -> instr
    DELTA _              -> instr

    JXX _ _              -> instr
    JXX_GBL _ _          -> instr
    CLTD _               -> instr

    POPCNT fmt src dst   -> POPCNT fmt (patchOp src) (env dst)
    LZCNT  fmt src dst   -> LZCNT  fmt (patchOp src) (env dst)
    TZCNT  fmt src dst   -> TZCNT  fmt (patchOp src) (env dst)
    PDEP   fmt src mask dst -> PDEP fmt (patchOp src) (patchOp mask) (env dst)
    PEXT   fmt src mask dst -> PEXT fmt (patchOp src) (patchOp mask) (env dst)
    BSF    fmt src dst   -> BSF    fmt (patchOp src) (env dst)
    BSR    fmt src dst   -> BSR    fmt (patchOp src) (env dst)

    PREFETCH lvl format src -> PREFETCH lvl format (patchOp src)

    LOCK i               -> LOCK (x86_patchRegsOfInstr i env)
    XADD fmt src dst     -> patch2 (XADD fmt) src dst
    CMPXCHG fmt src dst  -> patch2 (CMPXCHG fmt) src dst
    MFENCE               -> instr

    _other              -> panic "patchRegs: unrecognised instr"

 where
    patch1 :: (Operand -> a) -> Operand -> a
    patch1 insn op      = insn $! patchOp op
    patch2 :: (Operand -> Operand -> a) -> Operand -> Operand -> a
    patch2 insn src dst = (insn $! patchOp src) $! patchOp dst

    patchOp (OpReg  reg) = OpReg $! env reg
    patchOp (OpImm  imm) = OpImm imm
    patchOp (OpAddr ea)  = OpAddr $! lookupAddr ea

    lookupAddr (ImmAddr imm off) = ImmAddr imm off
    lookupAddr (AddrBaseIndex base index disp)
      = ((AddrBaseIndex $! lookupBase base) $! lookupIndex index) disp
      where
        lookupBase EABaseNone       = EABaseNone
        lookupBase EABaseRip        = EABaseRip
        lookupBase (EABaseReg r)    = EABaseReg $! env r

        lookupIndex EAIndexNone     = EAIndexNone
        lookupIndex (EAIndex r i)   = (EAIndex $! env r) i


--------------------------------------------------------------------------------
-- | True for instructions that transfer control (jumps, branches, calls).
x86_isJumpishInstr
        :: Instr -> Bool

x86_isJumpishInstr instr
 = case instr of
        JMP{}           -> True
        JXX{}           -> True
        JXX_GBL{}       -> True
        JMP_TBL{}       -> True
        CALL{}          -> True
        _               -> False


-- | The local 'BlockId's an instruction may jump to (non-local
-- destinations such as JXX_GBL are not included).
x86_jumpDestsOfInstr
        :: Instr
        -> [BlockId]

x86_jumpDestsOfInstr insn
  = case insn of
        JXX _ id        -> [id]
        JMP_TBL _ ids _ _ -> [id | Just (DestBlockId id) <- ids]
        _               -> []


-- | Apply a function to every local jump target in the instruction.
x86_patchJumpInstr
        :: Instr -> (BlockId -> BlockId) -> Instr

x86_patchJumpInstr insn patchF
  = case insn of
        JXX cc id       -> JXX cc (patchF id)
        JMP_TBL op ids section lbl
          -> JMP_TBL op (map (fmap (patchJumpDest patchF)) ids) section lbl
        _               -> insn
    where
        patchJumpDest f (DestBlockId id) = DestBlockId (f id)
        patchJumpDest _ dest             = dest




-- -----------------------------------------------------------------------------
-- | Make a spill instruction.
x86_mkSpillInstr
    :: DynFlags
    -> Reg          -- register to spill
    -> Int          -- current stack delta
    -> Int          -- spill slot to use
    -> Instr

x86_mkSpillInstr dflags reg delta slot
  = let off = spillSlotToOffset platform slot - delta
    in
    case targetClassOfReg platform reg of
          RcInteger -> MOV (archWordFormat is32Bit)
                           (OpReg reg) (OpAddr (spRel dflags off))
          RcDouble  -> MOV FF64 (OpReg reg) (OpAddr (spRel dflags off))
          _         -> panic "X86.mkSpillInstr: no match"
    where platform = targetPlatform dflags
          is32Bit  = target32Bit platform

-- | Make a spill reload instruction.
x86_mkLoadInstr
    :: DynFlags
    -> Reg          -- register to load into
    -> Int          -- current stack delta
    -> Int          -- spill slot to read from
    -> Instr

-- | Reload a spilled register from its stack slot.  Mirrors
-- 'x86_mkSpillInstr' with the MOV operands swapped.
x86_mkLoadInstr dflags reg delta slot =
    case targetClassOfReg platform reg of
        RcInteger -> load (archWordFormat (target32Bit platform))
        RcDouble  -> load FF64
        _         -> panic "X86.x86_mkLoadInstr"
  where
    platform = targetPlatform dflags
    offset   = spillSlotToOffset platform slot - delta
    load fmt = MOV fmt (OpAddr (spRel dflags offset)) (OpReg reg)

-- | Size of one spill slot in bytes.  On 32-bit targets a slot is 12
-- bytes; on 64-bit targets, 8 bytes.
spillSlotSize :: Platform -> Int
spillSlotSize platform
    | target32Bit platform = 12
    | otherwise            = 8

-- | How many spill slots fit in the reserved C stack area (one slot is
-- held back, and the first 64 bytes are skipped; see 'spillSlotToOffset').
maxSpillSlots :: DynFlags -> Int
maxSpillSlots dflags =
    (reserved `div` spillSlotSize (targetPlatform dflags)) - 1
  where
    reserved = rESERVED_C_STACK_BYTES dflags - 64
-- maxSpillSlots _ = 0 -- useful for testing allocMoreStack

-- | Number of bytes that the stack pointer should be aligned to.
stackAlign :: Int
stackAlign = 16

-- | Convert a spill slot number to a *byte* offset, with no sign:
-- decide on a per arch basis whether you are spilling above or below
-- the C stack pointer.
spillSlotToOffset :: Platform -> Int -> Int
spillSlotToOffset platform slot = 64 + spillSlotSize platform * slot

--------------------------------------------------------------------------------

-- | See if this instruction is telling us the current C stack delta.
x86_takeDeltaInstr
    :: Instr
    -> Maybe Int

x86_takeDeltaInstr (DELTA i) = Just i
x86_takeDeltaInstr _         = Nothing


-- | True for pseudo-instructions that carry bookkeeping information
-- only and emit no machine code.
x86_isMetaInstr
    :: Instr
    -> Bool

x86_isMetaInstr COMMENT{}  = True
x86_isMetaInstr LOCATION{} = True
x86_isMetaInstr LDATA{}    = True
x86_isMetaInstr NEWBLOCK{} = True
x86_isMetaInstr UNWIND{}   = True
x86_isMetaInstr DELTA{}    = True
x86_isMetaInstr _          = False



--- TODO: why is there
-- | Make a reg-reg move instruction.
-- On SPARC v8 there are no instructions to move directly between
-- floating point and integer regs.  If we need to do that then we
-- have to go via memory.  (NOTE(review): this paragraph describes
-- SPARC; it appears to have been copied here and does not apply to
-- the x86 implementation below.)
--
x86_mkRegRegMoveInstr
    :: Platform
    -> Reg
    -> Reg
    -> Instr

x86_mkRegRegMoveInstr platform src dst
 = case targetClassOfReg platform src of
        RcInteger -> case platformArch platform of
                     ArchX86    -> MOV II32 (OpReg src) (OpReg dst)
                     ArchX86_64 -> MOV II64 (OpReg src) (OpReg dst)
                     _          -> panic "x86_mkRegRegMoveInstr: Bad arch"
        RcDouble  -> MOV FF64 (OpReg src) (OpReg dst)
        -- this code is the lie we tell ourselves because both float and double
        -- use the same register class on x86_64 and x86 32bit with SSE2;
        -- more plainly, both use the XMM registers
        _         -> panic "X86.RegInfo.mkRegRegMoveInstr: no match"

-- | Check whether an instruction represents a reg-reg move.
--      The register allocator attempts to eliminate reg->reg moves whenever it can,
--      by assigning the src and dest temporaries to the same real register.
--
x86_takeRegRegMoveInstr
    :: Instr
    -> Maybe (Reg,Reg)

x86_takeRegRegMoveInstr (MOV _ (OpReg r1) (OpReg r2))
        = Just (r1,r2)

x86_takeRegRegMoveInstr _ = Nothing


-- | Make an unconditional branch instruction.
x86_mkJumpInstr
    :: BlockId
    -> [Instr]

x86_mkJumpInstr id
        = [JXX ALWAYS id]

-- Note [Windows stack layout]
-- | On most OSes the kernel will place a guard page after the current stack
-- page.  If you allocate larger than a page worth you may jump over this
-- guard page.  Not only is this a security issue, but on certain OSes such
-- as Windows a new page won't be allocated if you don't hit the guard.  This
-- will cause a segfault or access fault.
--
-- This function defines if the current allocation amount requires a probe.
-- On Windows (for now) we emit a call to _chkstk for this.  For other OSes
-- this is not yet implemented.
-- See https://docs.microsoft.com/en-us/windows/desktop/DevNotes/-win32-chkstk
-- The Windows stack looks like this:
--
--        +-------------------+
--        |        SP         |
--        +-------------------+
--        |                   |
--        |    GUARD PAGE     |
--        |                   |
--        +-------------------+
--        |                   |
--        |                   |
--        |     UNMAPPED      |
--        |                   |
--        |                   |
--        +-------------------+
--
-- In essence each allocation larger than a page size needs to be chunked and
-- a probe emitted after each page allocation.  You have to hit the guard
-- page so the kernel can map in the next page, otherwise you'll segfault.
-- See Note [Windows stack allocations].
--
needs_probe_call :: Platform -> Int -> Bool
needs_probe_call platform amount
  = case platformOS platform of
     OSMinGW32 -> case platformArch platform of
                    ArchX86    -> amount > (4 * 1024)
                    ArchX86_64 -> amount > (4 * 1024)
                    _          -> False
     _         -> False

x86_mkStackAllocInstr
        :: Platform
        -> Int
        -> [Instr]
x86_mkStackAllocInstr platform amount
  = case platformOS platform of
      OSMinGW32 ->
        -- These will clobber AX but this should be ok because
        --
        -- 1. It is the first thing we do when entering the closure and AX is
        --    a caller saved registers on Windows both on x86_64 and x86.
        --
        -- 2. The closures are only entered via a call or longjmp in which case
        --    there are no expectations for volatile registers.
        --
        -- 3. When the target is a local branch point it is re-targeted
        --    after the dealloc, preserving #2.  See note [extra spill slots].
        --
        -- We emit a call because the stack probes are quite involved and
        -- would bloat code size a lot.  GHC doesn't really have an -Os.
        -- ___chkstk is guaranteed to leave all nonvolatile registers and AX
        -- untouched.  It's part of the standard prologue code for any Windows
        -- function dropping the stack more than a page.
        -- See Note [Windows stack layout]
        case platformArch platform of
            ArchX86    | needs_probe_call platform amount ->
                           [ MOV II32 (OpImm (ImmInt amount)) (OpReg eax)
                           , CALL (Left $ strImmLit "___chkstk_ms") [eax]
                           , SUB II32 (OpReg eax) (OpReg esp)
                           ]
                       | otherwise ->
                           [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp)
                           , TEST II32 (OpReg esp) (OpReg esp)
                           ]
            ArchX86_64 | needs_probe_call platform amount ->
                           [ MOV II64 (OpImm (ImmInt amount)) (OpReg rax)
                           , CALL (Left $ strImmLit "___chkstk_ms") [rax]
                           , SUB II64 (OpReg rax) (OpReg rsp)
                           ]
                       | otherwise ->
                           [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp)
                           , TEST II64 (OpReg rsp) (OpReg rsp)
                           ]
            _ -> panic "x86_mkStackAllocInstr"
      _       ->
        case platformArch platform of
          ArchX86    -> [ SUB II32 (OpImm (ImmInt amount)) (OpReg esp) ]
          ArchX86_64 -> [ SUB II64 (OpImm (ImmInt amount)) (OpReg rsp) ]
          _ -> panic "x86_mkStackAllocInstr"

x86_mkStackDeallocInstr
        :: Platform
        -> Int
        -> [Instr]
x86_mkStackDeallocInstr platform amount
  = case platformArch platform of
      ArchX86    -> [ADD II32 (OpImm (ImmInt amount)) (OpReg esp)]
      ArchX86_64 -> [ADD II64 (OpImm (ImmInt amount)) (OpReg rsp)]
      _ -> panic "x86_mkStackDeallocInstr"


--
-- Note [extra spill slots]
--
-- If the register allocator used more spill slots than we have
-- pre-allocated (rESERVED_C_STACK_BYTES), then we must allocate more
-- C stack space on entry and exit from this proc.  Therefore we
-- insert a "sub $N, %rsp" at every entry point, and an "add $N, %rsp"
-- before every non-local jump.
--
-- This became necessary when the new codegen started bundling entire
-- functions together into one proc, because the register allocator
-- assigns a different stack slot to each virtual reg within a proc.
-- To avoid using so many slots we could also:
--
--   - split up the proc into connected components before code generator
--
--   - rename the virtual regs, so that we re-use vreg names and hence
--     stack slots for non-overlapping vregs.
--
-- Note that when a block is both a non-local entry point (with an
-- info table) and a local branch target, we have to split it into
-- two, like so:
--
--    <info table>
--    L:
--       <code>
--
-- becomes
--
--    <info table>
--    L:
--       subl $rsp, N
--       jmp Lnew
--    Lnew:
--       <code>
--
-- and all branches pointing to L are retargeted to point to Lnew.
-- Otherwise, we would repeat the $rsp adjustment for each branch to
-- L.
--
-- Returns a list of (L,Lnew) pairs.
--
allocMoreStack
  :: Platform
  -> Int
  -> NatCmmDecl statics X86.Instr.Instr
  -> UniqSM (NatCmmDecl statics X86.Instr.Instr, [(BlockId,BlockId)])

-- Data sections contain no instructions, so nothing to do.
allocMoreStack _ _ top@(CmmData _ _) = return (top,[])
allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
    let entries = entryBlocks proc

    -- One fresh unique per entry block: each entry L is paired with a
    -- fresh Lnew that will hold L's original code (see note above).
    uniqs <- replicateM (length entries) getUniqueM

    let
      -- Extra bytes of C stack needed for the spill slots, rounded up
      -- to the platform's stack alignment.
      delta = ((x + stackAlign - 1) `quot` stackAlign) * stackAlign -- round up
        where x = slots * spillSlotSize platform -- sp delta

      alloc = mkStackAllocInstr platform delta
      dealloc = mkStackDeallocInstr platform delta

      retargetList = (zip entries (map mkBlockId uniqs))

      -- Maps each entry block L to its fresh twin Lnew.
      new_blockmap :: LabelMap BlockId
      new_blockmap = mapFromList retargetList

      -- For an entry block, emit the alloc followed by a jump to the
      -- renamed twin carrying the original (dealloc-patched) code.
      -- Other blocks just get the dealloc/retarget treatment.
      insert_stack_insns (BasicBlock id insns)
         | Just new_blockid <- mapLookup id new_blockmap
         = [ BasicBlock id $ alloc ++ [JXX ALWAYS new_blockid]
           , BasicBlock new_blockid block' ]
         | otherwise
         = [ BasicBlock id block' ]
         where
           block' = foldr insert_dealloc [] insns

      -- Put a dealloc before every non-local jump, and retarget local
      -- branches aimed at a split entry block to its new twin.
      insert_dealloc insn r = case insn of
           JMP _ _ -> dealloc ++ (insn : r)
           JXX_GBL _ _ -> panic "insert_dealloc: cannot handle JXX_GBL"
           _other -> x86_patchJumpInstr insn retarget : r
               where retarget b = fromMaybe b (mapLookup b new_blockmap)

      new_code = concatMap insert_stack_insns code
    -- in
    return (CmmProc info lbl live (ListGraph new_code), retargetList)

-- | The target of a (shortcut-able) jump: either a local block or an
-- immediate (e.g. a label outside this proc).
data JumpDest = DestBlockId BlockId | DestImm Imm

-- Debug Instance
instance Outputable JumpDest where
  ppr (DestBlockId bid) = text "jd<blk>:" <> ppr bid
  ppr (DestImm _imm) = text "jd<imm>:noShow"


-- | Extract the 'BlockId' from a destination, if it names a local block.
getJumpDestBlockId :: JumpDest -> Maybe BlockId
getJumpDestBlockId (DestBlockId bid) = Just bid
getJumpDestBlockId _ = Nothing

-- | If this instruction is an unconditional jump, return its destination
-- so branches to it can be shortcut straight to that destination.
canShortcut :: Instr -> Maybe JumpDest
canShortcut (JXX ALWAYS id) = Just (DestBlockId id)
canShortcut (JMP (OpImm imm) _) = Just (DestImm imm)
canShortcut _ = Nothing


-- This helper shortcuts a sequence of branches.
-- The blockset helps avoid following cycles.
shortcutJump :: (BlockId -> Maybe JumpDest) -> Instr -> Instr
shortcutJump fn insn = shortcutJump' fn (setEmpty :: LabelSet) insn
  where
    shortcutJump' :: (BlockId -> Maybe JumpDest) -> LabelSet -> Instr -> Instr
    -- Follow chains of local jumps, recording visited blocks in `seen`
    -- so a cycle of jumps terminates rather than looping forever.
    shortcutJump' fn seen insn@(JXX cc id) =
        if setMember id seen then insn
        else case fn id of
            Nothing                -> insn
            Just (DestBlockId id') -> shortcutJump' fn seen' (JXX cc id')
            Just (DestImm imm)     -> shortcutJump' fn seen' (JXX_GBL cc imm)
        where seen' = setInsert id seen
    -- Jump tables: shortcut each local entry independently.
    shortcutJump' fn _ (JMP_TBL addr blocks section tblId) =
        let updateBlock (Just (DestBlockId bid)) =
                case fn bid of
                    Nothing   -> Just (DestBlockId bid )
                    Just dest -> Just dest
            updateBlock dest = dest
            blocks' = map updateBlock blocks
        in  JMP_TBL addr blocks' section tblId
    shortcutJump' _ _ other = other

-- Here because it knows about JumpDest
shortcutStatics :: (BlockId -> Maybe JumpDest) -> (Alignment, CmmStatics) -> (Alignment, CmmStatics)
shortcutStatics fn (align, Statics lbl statics)
  = (align, Statics lbl $ map (shortcutStatic fn) statics)
  -- we need to get the jump tables, so apply the mapping to the entries
  -- of a CmmData too.

-- | Shortcut a label when it refers to a local block; other labels are
-- left untouched.
shortcutLabel :: (BlockId -> Maybe JumpDest) -> CLabel -> CLabel
shortcutLabel fn lab
  | Just blkId <- maybeLocalBlockLabel lab = shortBlockId fn emptyUniqSet blkId
  | otherwise                              = lab

-- | Apply label shortcutting inside a static datum (jump-table entries).
shortcutStatic :: (BlockId -> Maybe JumpDest) -> CmmStatic -> CmmStatic
shortcutStatic fn (CmmStaticLit (CmmLabel lab))
  = CmmStaticLit (CmmLabel (shortcutLabel fn lab))
shortcutStatic fn (CmmStaticLit (CmmLabelDiffOff lbl1 lbl2 off w))
  = CmmStaticLit (CmmLabelDiffOff (shortcutLabel fn lbl1) lbl2 off w)
  -- slightly dodgy, we're ignoring the second label, but this
  -- works with the way we use CmmLabelDiffOff for jump tables now.
shortcutStatic _ other_static
  = other_static

-- | Follow a chain of block shortcuts to the final label.  The set of
-- uniques already visited guards against cycles, in which case we stop
-- at the current block's own label.
shortBlockId
  :: (BlockId -> Maybe JumpDest)
  -> UniqSet Unique
  -> BlockId
  -> CLabel

shortBlockId fn seen blockid =
  case (elementOfUniqSet uq seen, fn blockid) of
    (True, _)    -> blockLbl blockid
    (_, Nothing) -> blockLbl blockid
    (_, Just (DestBlockId blockid'))  -> shortBlockId fn (addOneToUniqSet seen uq) blockid'
    (_, Just (DestImm (ImmCLbl lbl))) -> lbl
    (_, _other) -> panic "shortBlockId"
  where uq = getUnique blockid