{-# LANGUAGE GADTs #-}
module CmmSink (
     cmmSink
  ) where

import GhcPrelude

import Cmm
import CmmOpt
import CmmLive
import CmmUtils
import Hoopl.Block
import Hoopl.Label
import Hoopl.Collections
import Hoopl.Graph
import GHC.Platform.Regs
import GHC.Platform (isARM, platformArch)

import DynFlags
import Unique
import UniqFM

import qualified Data.IntSet as IntSet
import Data.List (partition)
import qualified Data.Set as Set
import Data.Maybe

-- Compact sets for membership tests of local variables.
-- A 'LocalReg' is represented by the 'Int' key of its 'Unique', so an
-- 'IntSet.IntSet' gives us cheap insertion and membership tests.

type LRegSet = IntSet.IntSet

-- | The empty set of local registers.
emptyLRegSet :: LRegSet
emptyLRegSet = IntSet.empty

-- | Is the set empty?
nullLRegSet :: LRegSet -> Bool
nullLRegSet = IntSet.null

-- | Insert a local register, keyed by its 'Unique'.
insertLRegSet :: LocalReg -> LRegSet -> LRegSet
insertLRegSet l = IntSet.insert (getKey (getUnique l))

-- | Membership test, keyed by the register's 'Unique'.
elemLRegSet :: LocalReg -> LRegSet -> Bool
elemLRegSet l = IntSet.member (getKey (getUnique l))

-- -----------------------------------------------------------------------------
-- Sinking and inlining

-- This is an optimisation pass that
--  (a) moves assignments closer to their uses, to reduce register pressure
--  (b) pushes assignments into a single branch of a conditional if possible
--  (c) inlines assignments to registers that are mentioned only once
--  (d) discards dead assignments
--
-- This tightens up lots of register-heavy code.  It is particularly
-- helpful in the Cmm generated by the Stg->Cmm code generator, in
-- which every function starts with a copyIn sequence like:
--
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]
--    if (Sp - 32 < SpLim) then L1 else L2
--
-- we really want to push the x1..x3 assignments into the L2 branch.
--
-- Algorithm:
--
--  * Start by doing liveness analysis.
--
--  * Keep a list of assignments A; earlier ones may refer to later ones.
--    Currently we only sink assignments to local registers, because we don't
--    have liveness information about global registers.
71-- 72-- * Walk forwards through the graph, look at each node N: 73-- 74-- * If it is a dead assignment, i.e. assignment to a register that is 75-- not used after N, discard it. 76-- 77-- * Try to inline based on current list of assignments 78-- * If any assignments in A (1) occur only once in N, and (2) are 79-- not live after N, inline the assignment and remove it 80-- from A. 81-- 82-- * If an assignment in A is cheap (RHS is local register), then 83-- inline the assignment and keep it in A in case it is used afterwards. 84-- 85-- * Otherwise don't inline. 86-- 87-- * If N is assignment to a local register pick up the assignment 88-- and add it to A. 89-- 90-- * If N is not an assignment to a local register: 91-- * remove any assignments from A that conflict with N, and 92-- place them before N in the current block. We call this 93-- "dropping" the assignments. 94-- 95-- * An assignment conflicts with N if it: 96-- - assigns to a register mentioned in N 97-- - mentions a register assigned by N 98-- - reads from memory written by N 99-- * do this recursively, dropping dependent assignments 100-- 101-- * At an exit node: 102-- * drop any assignments that are live on more than one successor 103-- and are not trivial 104-- * if any successor has more than one predecessor (a join-point), 105-- drop everything live in that successor. Since we only propagate 106-- assignments that are not dead at the successor, we will therefore 107-- eliminate all assignments dead at this point. Thus analysis of a 108-- join-point will always begin with an empty list of assignments. 109-- 110-- 111-- As a result of above algorithm, sinking deletes some dead assignments 112-- (transitively, even). This isn't as good as removeDeadAssignments, 113-- but it's much cheaper. 114 115-- ----------------------------------------------------------------------------- 116-- things that we aren't optimising very well yet. 
--
-- -----------
-- (1) From GHC's FastString.hashStr:
--
--  s2ay:
--      if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
--  c2gn:
--      R1 = _s2au::I64;
--      call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
--  c2gp:
--      _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
--                                 4091);
--      _s2an::I64 = _s2an::I64 + 1;
--      _s2au::I64 = _s2cO::I64;
--      goto s2ay;
--
-- a nice loop, but we didn't eliminate the silly assignment at the end.
-- See Note [dependent assignments], which would probably fix this.
-- This is #8336.
--
-- -----------
-- (2) From stg_atomically_frame in PrimOps.cmm
--
-- We have a diamond control flow:
--
--     x = ...
--       |
--      / \
--     A   B
--      \ /
--       |
--    use of x
--
-- Now x won't be sunk down to its use, because we won't push it into
-- both branches of the conditional.  We certainly do have to check
-- that we can sink it past all the code in both A and B, but having
-- discovered that, we could sink it to its use.
--

-- -----------------------------------------------------------------------------

type Assignment = (LocalReg, CmmExpr, AbsMem)
  -- Assignment caches AbsMem, an abstraction of the memory read by
  -- the RHS of the assignment.  Caching it here saves recomputing it
  -- every time we test the assignment for conflicts.
type Assignments = [Assignment]
  -- A sequence of assignments; kept in *reverse* order
  -- So the list [ x=e1, y=e2 ] means the sequence of assignments
  --     y = e2
  --     x = e1

-- | Run the sinking/inlining pass over a whole Cmm graph.
-- Blocks are processed in reverse postorder, so each block is seen
-- before its (non-loop) successors; assignments picked up in a block
-- are carried into its successors via the 'sunk' map.
cmmSink :: DynFlags -> CmmGraph -> CmmGraph
cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
  where
  liveness = cmmLocalLiveness dflags graph
  getLive l = mapFindWithDefault Set.empty l liveness

  blocks = revPostorder graph

  join_pts = findJoinPoints blocks

  sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock]
  sink _ [] = []
  sink sunk (b:bs) =
    -- pprTrace "sink" (ppr lbl) $
    blockJoin first final_middle final_last : sink sunk' bs
    where
      lbl = entryLabel b
      (first, middle, last) = blockSplit b

      succs = successors last

      -- Annotate the middle nodes with the registers live *after*
      -- the node.  This will help us decide whether we can inline
      -- an assignment in the current node or not.
      live = Set.unions (map getLive succs)
      live_middle = gen_kill dflags last live
      ann_middles = annotate dflags live_middle (blockToList middle)

      -- Now sink and inline in this block
      (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
      fold_last = constantFoldNode dflags last
      (final_last, assigs') = tryToInline dflags live fold_last assigs

      -- We cannot sink into join points (successors with more than
      -- one predecessor), so identify the join points and the set
      -- of registers live in them.
      (joins, nonjoins) = partition (`mapMember` join_pts) succs
      live_in_joins = Set.unions (map getLive joins)

      -- We do not want to sink an assignment into multiple branches,
      -- so identify the set of registers live in multiple successors.
      -- This is made more complicated because when we sink an assignment
      -- into one branch, this might change the set of registers that are
      -- now live in multiple branches.
      init_live_sets = map getLive nonjoins
      live_in_multi live_sets r =
         case filter (Set.member r) live_sets of
           (_one:_two:_) -> True
           _ -> False

      -- Now, drop any assignments that we will not sink any further.
      (dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs'

      drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
          where
            should_drop =  conflicts dflags a final_last
                        || not (isTrivial dflags rhs) && live_in_multi live_sets r
                        || r `Set.member` live_in_joins

            -- if we sink the assignment, the registers its RHS reads
            -- become live in every branch that uses r.
            live_sets' | should_drop = live_sets
                       | otherwise   = map upd live_sets

            upd set | r `Set.member` set = set `Set.union` live_rhs
                    | otherwise          = set

            live_rhs = foldRegsUsed dflags extendRegSet emptyRegSet rhs

      final_middle = foldl' blockSnoc middle' dropped_last

      -- Pass the surviving assignments on to every successor.  Only
      -- the assignments live in a given successor are kept for it.
      sunk' = mapUnion sunk $
                 mapFromList [ (l, filterAssignments dflags (getLive l) assigs'')
                             | l <- succs ]

{- TODO: enable this later, when we have some good tests in place to
   measure the effect and tune it.

-- small: an expression we don't mind duplicating
isSmall :: CmmExpr -> Bool
isSmall (CmmReg (CmmLocal _)) = True --
isSmall (CmmLit _) = True
isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
isSmall (CmmRegOff (CmmLocal _) _) = True
isSmall _ = False
-}

--
-- We allow duplication of trivial expressions: registers (both local and
-- global) and literals.
--
isTrivial :: DynFlags -> CmmExpr -> Bool
isTrivial _ (CmmReg (CmmLocal _)) = True
isTrivial dflags (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
  if isARM (platformArch (targetPlatform dflags))
  then True -- CodeGen.Platform.ARM does not have globalRegMaybe
  else isJust (globalRegMaybe (targetPlatform dflags) r)
  -- GlobalRegs that are loads from BaseReg are not trivial
isTrivial _ (CmmLit _) = True
isTrivial _ _ = False

--
-- annotate each node with the set of registers live *after* the node
--
annotate :: DynFlags -> LocalRegSet -> [CmmNode O O] -> [(LocalRegSet, CmmNode O O)]
annotate dflags live nodes = snd $ foldr ann (live,[]) nodes
  where ann n (live,nodes) = (gen_kill dflags n live, (live,n) : nodes)

--
-- Find the blocks that have multiple *predecessors* (join points),
-- i.e. the labels that appear in the successor list of more than one
-- block.  We cannot sink assignments into such blocks.
--
findJoinPoints :: [CmmBlock] -> LabelMap Int
findJoinPoints blocks = mapFilter (>1) succ_counts
  where
        all_succs = concatMap successors blocks

        succ_counts :: LabelMap Int
        succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs

--
-- filter the list of assignments to remove any assignments that
-- are not live in a continuation.
--
filterAssignments :: DynFlags -> LocalRegSet -> Assignments -> Assignments
filterAssignments dflags live assigs = reverse (go assigs [])
  where go []             kept = kept
        go (a@(r,_,_):as) kept | needed    = go as (a:kept)
                               | otherwise = go as kept
           where
              needed = r `Set.member` live
                       || any (conflicts dflags a) (map toNode kept)
                       -- Note that we must keep assignments that are
                       -- referred to by other assignments we have
                       -- already kept.

-- -----------------------------------------------------------------------------
-- Walk through the nodes of a block, sinking and inlining assignments
-- as we go.
--
-- On input we pass in a:
--    * list of nodes in the block
--    * a list of assignments that appeared *before* this block and
--      that are being sunk.
--
-- On output we get:
--    * a new block
--    * a list of assignments that will be placed *after* that block.
--

walk :: DynFlags
     -> [(LocalRegSet, CmmNode O O)]    -- nodes of the block, annotated with
                                        -- the set of registers live *after*
                                        -- this node.

     -> Assignments                     -- The current list of
                                        -- assignments we are sinking.
                                        -- Earlier assignments may refer
                                        -- to later ones.

     -> ( Block CmmNode O O             -- The new block
        , Assignments                   -- Assignments to sink further
        )

walk dflags nodes assigs = go nodes emptyBlock assigs
 where
   go []               block as = (block, as)
   go ((live,node):ns) block as
    -- Each node is constant-folded, then the current assignments are
    -- inlined into it; the node is then either discarded (dead),
    -- picked up as a new assignment to sink, or emitted, dropping any
    -- conflicting assignments just before it.
    | shouldDiscard node live           = go ns block as
       -- discard dead assignment
    | Just a <- shouldSink dflags node2 = go ns block (a : as1)
    | otherwise                         = go ns block' as'
    where
      node1 = constantFoldNode dflags node

      (node2, as1) = tryToInline dflags live node1 as

      (dropped, as') = dropAssignmentsSimple dflags
                          (\a -> conflicts dflags a node2) as1

      block' = foldl' blockSnoc block dropped `blockSnoc` node2


--
-- Heuristic to decide whether to pick up and sink an assignment
-- Currently we pick up all assignments to local registers.  It might
-- be profitable to sink assignments to global regs too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment
shouldSink dflags (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem dflags e)
  where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
shouldSink _ _other = Nothing

--
-- discard dead assignments.  This doesn't do as good a job as
-- removeDeadAssignments, because it would need multiple passes
-- to get all the dead code, but it catches the common case of
-- superfluous reloads from the stack that the stack allocator
-- leaves behind.
--
-- Also we catch "r = r" here.
-- You might think it would fall
-- out of inlining, but the inliner will see that r is live
-- after the instruction and choose not to inline r in the rhs.
--
shouldDiscard :: CmmNode e x -> LocalRegSet -> Bool
shouldDiscard node live
   = case node of
       CmmAssign r (CmmReg r') | r == r' -> True
       CmmAssign (CmmLocal r) _ -> not (r `Set.member` live)
       _otherwise -> False


-- | Turn a pending assignment back into a Cmm node, for emission.
toNode :: Assignment -> CmmNode O O
toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs

-- | 'dropAssignments' specialised to a stateless predicate.
dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> Assignments
                      -> ([CmmNode O O], Assignments)
dropAssignmentsSimple dflags f = dropAssignments dflags (\a _ -> (f a, ())) ()

-- | Split the pending assignments into those we must emit now
-- (dropped, as nodes, in program order) and those that can keep
-- sinking.  An assignment is dropped if the caller's predicate says
-- so, or if it conflicts with an assignment already dropped --
-- dropping is transitive through dependencies.
dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
                -> ([CmmNode O O], Assignments)
dropAssignments dflags should_drop state assigs
 = (dropped, reverse kept)
 where
   (dropped,kept) = go state assigs [] []

   go _     []             dropped kept = (dropped, kept)
   go state (assig : rest) dropped kept
      | conflict  = go state' rest (toNode assig : dropped) kept
      | otherwise = go state' rest dropped (assig:kept)
      where
        (dropit, state') = should_drop assig state
        conflict = dropit || any (conflicts dflags assig) dropped


-- -----------------------------------------------------------------------------
-- Try to inline assignments into a node.
-- This also does constant folding for primpops, since
-- inlining opens up opportunities for doing so.

tryToInline
   :: DynFlags
   -> LocalRegSet               -- set of registers live after this
                                -- node.  We cannot inline anything
                                -- that is live after the node, unless
                                -- it is small enough to duplicate.
   -> CmmNode O x               -- The node to inline into
   -> Assignments               -- Assignments to inline
   -> (
        CmmNode O x             -- New node
      , Assignments             -- Remaining assignments
      )

tryToInline dflags live node assigs = go usages node emptyLRegSet assigs
 where
  usages :: UniqFM Int -- Maps each LocalReg to a count of how often it is used
  usages = foldLocalRegsUsed dflags addUsage emptyUFM node

  go _usages node _skipped [] = (node, [])

  go usages node skipped (a@(l,rhs,_) : rest)
   | cannot_inline           = dont_inline
   | occurs_none             = discard  -- Note [discard during inlining]
   | occurs_once             = inline_and_discard
   | isTrivial dflags rhs    = inline_and_keep
   | otherwise               = dont_inline
   where
        inline_and_discard = go usages' inl_node skipped rest
          where usages' = foldLocalRegsUsed dflags addUsage usages rhs

        discard = go usages node skipped rest

        dont_inline        = keep node  -- don't inline the assignment, keep it
        inline_and_keep    = keep inl_node -- inline the assignment, keep it

        keep node' = (final_node, a : rest')
          where (final_node, rest') = go usages' node' (insertLRegSet l skipped) rest
                usages' = foldLocalRegsUsed dflags (\m r -> addToUFM m r 2)
                                            usages rhs
                -- we must not inline anything that is mentioned in the RHS
                -- of a binding that we have already skipped, so we set the
                -- usages of the regs on the RHS to 2.

        cannot_inline = skipped `regsUsedIn` rhs -- Note [dependent assignments]
                        || l `elemLRegSet` skipped
                        || not (okToInline dflags rhs node)

        l_usages = lookupUFM usages l
        l_live   = l `elemRegSet` live

        occurs_once = not l_live && l_usages == Just 1
        occurs_none = not l_live && isNothing l_usages

        inl_node = improveConditional (mapExpDeep inl_exp node)

        inl_exp :: CmmExpr -> CmmExpr
        -- inl_exp is where the inlining actually takes place!
        inl_exp (CmmReg    (CmmLocal l'))     | l == l' = rhs
        inl_exp (CmmRegOff (CmmLocal l') off) | l == l'
                    = cmmOffset dflags rhs off
                    -- re-constant fold after inlining
        inl_exp (CmmMachOp op args) = cmmMachOpFold dflags op args
        inl_exp other = other


{- Note [improveConditional]

cmmMachOpFold tries to simplify conditionals to turn things like
  (a == b) != 1
into
  (a != b)
but there's one case it can't handle: when the comparison is over
floating-point values, we can't invert it, because floating-point
comparisons aren't invertible (because of NaNs).

But we *can* optimise this conditional by swapping the true and false
branches.  Given
  CmmCondBranch ((a >## b) != 1) t f
we can turn it into
  CmmCondBranch (a >## b) f t

So here we catch conditionals that weren't optimised by cmmMachOpFold,
and apply above transformation to eliminate the comparison against 1.

It's tempting to just turn every != into == and then let cmmMachOpFold
do its thing, but that risks changing a nice fall-through conditional
into one that requires two jumps. (see swapcond_last in
CmmContFlowOpt), so instead we carefully look for just the cases where
we can eliminate a comparison.
-}
improveConditional :: CmmNode O x -> CmmNode O x
improveConditional
  (CmmCondBranch (CmmMachOp mop [x, CmmLit (CmmInt 1 _)]) t f l)
  | neLike mop, isComparisonExpr x
  = CmmCondBranch x f t (fmap not l)
  where
    neLike (MO_Ne _) = True
    neLike (MO_U_Lt _) = True -- (x<y) < 1 behaves like (x<y) != 1
    neLike (MO_S_Lt _) = True -- (x<y) < 1 behaves like (x<y) != 1
    neLike _ = False
improveConditional other = other

-- Note [dependent assignments]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- If our assignment list looks like
--
--    [ y = e,  x = ... y ... ]
--
-- We cannot inline x.  Remember this list is really in reverse order,
-- so it means  x = ... y ...;  y = e
--
-- Hence if we inline x, the outer assignment to y will capture the
-- reference in x's right hand side.
--
-- In this case we should rename the y in x's right-hand side,
-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
-- Now we can go ahead and inline x.
--
-- For now we do nothing, because this would require putting
-- everything inside UniqSM.
--
-- One more variant of this (#7366):
--
--    [ y = e, y = z ]
--
-- If we don't want to inline y = e, because y is used many times, we
-- might still be tempted to inline y = z (because we always inline
-- trivial rhs's).  But of course we can't, because y is equal to e,
-- not z.

-- Note [discard during inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Opportunities to discard assignments sometimes appear after we've
-- done some inlining.  Here's an example:
--
--      x = R1;
--      y = P64[x + 7];
--      z = P64[x + 15];
--      /* z is dead */
--      R1 = y & (-8);
--
-- The x assignment is trivial, so we inline it in the RHS of y, and
-- keep both x and y.  z gets dropped because it is dead, then we
-- inline y, and we have a dead assignment to x.  If we don't notice
-- that x is dead in tryToInline, we end up retaining it.

-- | Record one more use of @reg@, starting the count at 1 on first sight.
addUsage :: UniqFM Int -> LocalReg -> UniqFM Int
addUsage counts reg = addToUFM_C (+) counts reg 1

-- | Does the expression mention any local register from the given set?
-- The empty set is dispatched without traversing the expression at all.
regsUsedIn :: LRegSet -> CmmExpr -> Bool
regsUsedIn regs expr
  | nullLRegSet regs = False
  | otherwise        = wrapRecExpf hit expr False
  where
    hit (CmmReg    (CmmLocal l))   found = found || l `elemLRegSet` regs
    hit (CmmRegOff (CmmLocal l) _) found = found || l `elemLRegSet` regs
    hit _                          found = found

-- we don't inline into CmmUnsafeForeignCall if the expression refers
-- to global registers.  This is a HACK to avoid global registers
-- clashing with C argument-passing registers, really the back-end
-- ought to be able to handle it properly, but currently neither PprC
-- nor the NCG can do it.
-- See Note [Register parameter passing]
-- See also GHC.StgToCmm.Foreign.load_args_into_temps.
okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
okToInline dflags expr node@(CmmUnsafeForeignCall{}) =
    not (globalRegistersConflict dflags expr node)
okToInline _ _ _ = True

-- -----------------------------------------------------------------------------

-- | @conflicts (r,e) node@ is @False@ if and only if the assignment
-- @r = e@ can be safely commuted past statement @node@.
conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
conflicts dflags (r, rhs, addr) node

  -- (1) node defines registers used by rhs of assignment. This catches
  -- assignments and all three kinds of calls. See Note [Sinking and calls]
  | globalRegistersConflict dflags rhs node                       = True
  | localRegistersConflict  dflags rhs node                       = True

  -- (2) node uses register defined by assignment
  | foldRegsUsed dflags (\b r' -> r == r' || b) False node        = True

  -- (3) a store to an address conflicts with a read of the same memory
  | CmmStore addr' e <- node
  , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True

  -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
  | HeapMem    <- addr, CmmAssign (CmmGlobal Hp) _ <- node        = True
  | StackMem   <- addr, CmmAssign (CmmGlobal Sp) _ <- node        = True
  | SpMem{}    <- addr, CmmAssign (CmmGlobal Sp) _ <- node        = True

  -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
  | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem      = True

  -- (6) native calls clobber any memory
  | CmmCall{} <- node, memConflicts addr AnyMem                   = True

  -- (7) otherwise, no conflict
  | otherwise = False

-- Returns True if node defines any global registers that are used in the
-- Cmm expression
globalRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
globalRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmGlobal r) expr)
                 False node

-- Returns True if node defines any local registers that are used in the
-- Cmm expression
localRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
localRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmLocal r) expr)
                 False node

-- Note [Sinking and calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~
--
-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
-- and unsafe foreign (CmmUnsafeForeignCall). We perform sinking pass after
-- stack layout (see Note [Sinking after stack layout]) which leads to two
-- invariants related to calls:
--
--   a) during stack layout phase all safe foreign calls are turned into
--      unsafe foreign calls (see Note [Lower safe foreign calls]). This
--      means that we will never encounter CmmForeignCall node when running
--      sinking after stack layout
--
--   b) stack layout saves all variables live across a call on the stack
--      just before making a call (remember we are not sinking assignments to
--      stack):
--
--       L1:
--          x = R1
--          P64[Sp - 16] = L2
--          P64[Sp - 8]  = x
--          Sp = Sp - 16
--          call f() returns L2
--       L2:
--
--      We will attempt to sink { x = R1 } but we will detect conflict with
--      { P64[Sp - 8] = x } and hence we will drop { x = R1 } without even
--      checking whether it conflicts with { call f() }. In this way we will
--      never need to check any assignment conflicts with CmmCall. Remember
--      that we still need to check for potential memory conflicts.
--
-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
-- This assumption holds only when we do sinking after stack layout.
-- If we run
-- it before stack layout we need to check for possible conflicts with all three
-- kinds of calls.  Our `conflicts` function does that by using a generic
-- foldRegsDefd and foldRegsUsed functions defined in DefinerOfRegs and
-- UserOfRegs typeclasses.
--

-- An abstraction of memory read or written.
data AbsMem
  = NoMem            -- no memory accessed
  | AnyMem           -- arbitrary memory
  | HeapMem          -- definitely heap memory
  | StackMem         -- definitely stack memory
  | SpMem            -- <size>[Sp+n]
       {-# UNPACK #-} !Int   -- offset n from Sp, in bytes
       {-# UNPACK #-} !Int   -- width of the access, in bytes

-- Having SpMem is important because it lets us float loads from Sp
-- past stores to Sp as long as they don't overlap, and this helps to
-- unravel some long sequences of
--    x1 = [Sp + 8]
--    x2 = [Sp + 16]
--    ...
--    [Sp + 8]  = xi
--    [Sp + 16] = xj
--
-- Note that SpMem is invalidated if Sp is changed, but the definition
-- of 'conflicts' above handles that.

-- ToDo: this won't currently fix the following commonly occurring code:
--    x1 = [R1 + 8]
--    x2 = [R1 + 16]
--    ..
--    [Hp - 8]  = x1
--    [Hp - 16] = x2
--    ..

-- because [R1 + 8] and [Hp - 8] are both HeapMem.  We know that
-- assignments to [Hp + n] do not conflict with any other heap memory,
-- but this is tricky to nail down.  What if we had
--
--   x = Hp + n
--   [x] = ...
--
--  the store to [x] should be "new heap", not "old heap".
--  Furthermore, you could imagine that if we started inlining
--  functions in Cmm then there might well be reads of heap memory
--  that was written in the same basic block.  To take advantage of
--  non-aliasing of heap memory we will have to be more clever.

-- Note [Foreign calls clobber heap]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- It is tempting to say that foreign calls clobber only
-- non-heap/stack memory, but unfortunately we break this invariant in
-- the RTS.
-- For example, in stg_catch_retry_frame we call
-- stmCommitNestedTransaction() which modifies the contents of the
-- TRec it is passed (this actually caused incorrect code to be
-- generated).
--
-- Since the invariant is true for the majority of foreign calls,
-- perhaps we ought to have a special annotation for calls that can
-- modify heap/stack memory.  For now we just use the conservative
-- definition here.
--
-- Some CallishMachOp imply a memory barrier e.g. AtomicRMW and
-- therefore we should never float any memory operations across one of
-- these calls.


-- | Least upper bound of two memory abstractions: the result covers
-- every access either argument might make.
bothMems :: AbsMem -> AbsMem -> AbsMem
bothMems NoMem    x         = x
bothMems x        NoMem     = x
bothMems HeapMem  HeapMem   = HeapMem
bothMems StackMem StackMem  = StackMem
bothMems (SpMem o1 w1) (SpMem o2 w2)
  | o1 == o2  = SpMem o1 (max w1 w2)
  | otherwise = StackMem
bothMems SpMem{}  StackMem  = StackMem
bothMems StackMem SpMem{}   = StackMem
bothMems _         _        = AnyMem

-- | Might the two abstract accesses touch the same memory?
-- Heap and stack are assumed disjoint; two SpMem accesses conflict
-- exactly when their byte intervals [offset, offset+width) overlap.
memConflicts :: AbsMem -> AbsMem -> Bool
memConflicts NoMem      _          = False
memConflicts _          NoMem      = False
memConflicts HeapMem    StackMem   = False
memConflicts StackMem   HeapMem    = False
memConflicts SpMem{}    HeapMem    = False
memConflicts HeapMem    SpMem{}    = False
memConflicts (SpMem o1 w1) (SpMem o2 w2)
  | o1 < o2   = o1 + w1 > o2
  | otherwise = o2 + w2 > o1
memConflicts _         _         = True

-- | Abstract the memory read by an expression (loads, recursively
-- including the address computation of each load).
exprMem :: DynFlags -> CmmExpr -> AbsMem
exprMem dflags (CmmLoad addr w)  = bothMems (loadAddr dflags addr (typeWidth w)) (exprMem dflags addr)
exprMem dflags (CmmMachOp _ es)  = foldr bothMems NoMem (map (exprMem dflags) es)
exprMem _      _                 = NoMem

-- | Abstract a load from the given address expression at the given width.
loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem
loadAddr dflags e w =
  case e of
   CmmReg r       -> regAddr dflags r 0 w
   CmmRegOff r i  -> regAddr dflags r i w
   -- any other address that mentions Sp could alias the stack;
   -- otherwise we know nothing and assume arbitrary memory.
   _other | regUsedIn dflags spReg e -> StackMem
          | otherwise -> AnyMem

-- | Abstract a register-plus-offset address.
regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
regAddr _      (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
regAddr _      (CmmGlobal Hp) _ _ = HeapMem
regAddr _      (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
regAddr _      _ _ _ = AnyMem

{-
Note [Inline GlobalRegs?]

Should we freely inline GlobalRegs?

Actually it doesn't make a huge amount of difference either way, so we
*do* currently treat GlobalRegs as "trivial" and inline them
everywhere, but for what it's worth, here is what I discovered when I
(SimonM) looked into this:

Common sense says we should not inline GlobalRegs, because when we
have

  x = R1

the register allocator will coalesce this assignment, generating no
code, and simply record the fact that x is bound to $rbx (or
whatever).  Furthermore, if we were to sink this assignment, then the
range of code over which R1 is live increases, and the range of code
over which x is live decreases.  All things being equal, it is better
for x to be live than R1, because R1 is a fixed register whereas x can
live in any register.  So we should neither sink nor inline 'x = R1'.

However, not inlining GlobalRegs can have surprising
consequences. e.g.
(cgrun020) 793 794 c3EN: 795 _s3DB::P64 = R1; 796 _c3ES::P64 = _s3DB::P64 & 7; 797 if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV; 798 c3EU: 799 _s3DD::P64 = P64[_s3DB::P64 + 6]; 800 _s3DE::P64 = P64[_s3DB::P64 + 14]; 801 I64[Sp - 8] = c3F0; 802 R1 = _s3DE::P64; 803 P64[Sp] = _s3DD::P64; 804 805inlining the GlobalReg gives: 806 807 c3EN: 808 if (R1 & 7 >= 2) goto c3EU; else goto c3EV; 809 c3EU: 810 I64[Sp - 8] = c3F0; 811 _s3DD::P64 = P64[R1 + 6]; 812 R1 = P64[R1 + 14]; 813 P64[Sp] = _s3DD::P64; 814 815but if we don't inline the GlobalReg, instead we get: 816 817 _s3DB::P64 = R1; 818 if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV; 819 c3EU: 820 I64[Sp - 8] = c3F0; 821 R1 = P64[_s3DB::P64 + 14]; 822 P64[Sp] = P64[_s3DB::P64 + 6]; 823 824This looks better - we managed to inline _s3DD - but in fact it 825generates an extra reg-reg move: 826 827.Lc3EU: 828 movq $c3F0_info,-8(%rbp) 829 movq %rbx,%rax 830 movq 14(%rbx),%rbx 831 movq 6(%rax),%rax 832 movq %rax,(%rbp) 833 834because _s3DB is now live across the R1 assignment, we lost the 835benefit of coalescing. 836 837Who is at fault here? Perhaps if we knew that _s3DB was an alias for 838R1, then we would not sink a reference to _s3DB past the R1 839assignment. Or perhaps we *should* do that - we might gain by sinking 840it, despite losing the coalescing opportunity. 841 842Sometimes not inlining global registers wins by virtue of the rule 843about not inlining into arguments of a foreign call, e.g. (T7163) this 844is what happens when we inlined F1: 845 846 _s3L2::F32 = F1; 847 _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32); 848 (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(_c3O3::F32); 849 850but if we don't inline F1: 851 852 (_s3L7::F32) = call "ccall" arg hints: [] result hints: [] rintFloat(%MO_F_Mul_W32(_s3L2::F32, 853 10.0 :: W32)); 854-} 855