{-# LANGUAGE GADTs #-}
module CmmSink (
     cmmSink
  ) where

import GhcPrelude

import Cmm
import CmmOpt
import CmmLive
import CmmUtils
import Hoopl.Block
import Hoopl.Label
import Hoopl.Collections
import Hoopl.Graph
import GHC.Platform.Regs
import GHC.Platform (isARM, platformArch)

import DynFlags
import Unique
import UniqFM

import qualified Data.IntSet as IntSet
import Data.List (partition)
import qualified Data.Set as Set
import Data.Maybe

-- Compact sets for membership tests of local variables.

type LRegSet = IntSet.IntSet

emptyLRegSet :: LRegSet
emptyLRegSet = IntSet.empty

nullLRegSet :: LRegSet -> Bool
nullLRegSet = IntSet.null

insertLRegSet :: LocalReg -> LRegSet -> LRegSet
insertLRegSet l = IntSet.insert (getKey (getUnique l))

elemLRegSet :: LocalReg -> LRegSet -> Bool
elemLRegSet l = IntSet.member (getKey (getUnique l))
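
-- A minimal usage sketch (assuming some LocalReg 'r' is in scope): the set
-- is keyed by each register's Unique, so membership is a plain IntSet lookup.
--
--   let s = insertLRegSet r emptyLRegSet
--   in  (r `elemLRegSet` s, nullLRegSet s)   -- (True, False)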

-- -----------------------------------------------------------------------------
-- Sinking and inlining

-- This is an optimisation pass that
--  (a) moves assignments closer to their uses, to reduce register pressure
--  (b) pushes assignments into a single branch of a conditional if possible
--  (c) inlines assignments to registers that are mentioned only once
--  (d) discards dead assignments
--
-- This tightens up lots of register-heavy code.  It is particularly
-- helpful in the Cmm generated by the Stg->Cmm code generator, in
-- which every function starts with a copyIn sequence like:
--
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]
--    if (Sp - 32 < SpLim) then L1 else L2
--
-- we really want to push the x1..x3 assignments into the L2 branch.
--
-- Algorithm:
--
--  * Start by doing liveness analysis.
--
--  * Keep a list of assignments A; earlier ones may refer to later ones.
--    Currently we only sink assignments to local registers, because we don't
--    have liveness information about global registers.
--
--  * Walk forwards through the graph, look at each node N:
--
--    * If it is a dead assignment, i.e. an assignment to a register that is
--      not used after N, discard it.
--
--    * Try to inline based on the current list of assignments:
--      * If any assignments in A (1) occur only once in N, and (2) are
--        not live after N, inline the assignment and remove it
--        from A.
--
--      * If an assignment in A is cheap (RHS is a local register), then
--        inline the assignment and keep it in A in case it is used afterwards.
--
--      * Otherwise don't inline.
--
--    * If N is an assignment to a local register, pick up the assignment
--      and add it to A.
--
--    * If N is not an assignment to a local register:
--      * remove any assignments from A that conflict with N, and
--        place them before N in the current block.  We call this
--        "dropping" the assignments.
--
--      * An assignment conflicts with N if it:
--        - assigns to a register mentioned in N
--        - mentions a register assigned by N
--        - reads from memory written by N
--      * do this recursively, dropping dependent assignments
--
--    * At an exit node:
--      * drop any assignments that are live on more than one successor
--        and are not trivial
--      * if any successor has more than one predecessor (a join-point),
--        drop everything live in that successor. Since we only propagate
--        assignments that are not dead at the successor, we will therefore
--        eliminate all assignments dead at this point. Thus analysis of a
--        join-point will always begin with an empty list of assignments.
--
--
-- As a result of the above algorithm, sinking deletes some dead assignments
-- (transitively, even).  This isn't as good as removeDeadAssignments,
-- but it's much cheaper.
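
-- For example, sinking the copyIn sequence above (a sketch of the intended
-- result; label names are illustrative) pushes the assignments into the L2
-- branch, so the stack-check-failure path L1 never executes them:
--
--    if (Sp - 32 < SpLim) then L1 else L2
--  L2:
--    x1 = R1
--    x2 = Sp[8]
--    x3 = Sp[16]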

-- -----------------------------------------------------------------------------
-- things that we aren't optimising very well yet.
--
-- -----------
-- (1) From GHC's FastString.hashStr:
--
--  s2ay:
--      if ((_s2an::I64 == _s2ao::I64) >= 1) goto c2gn; else goto c2gp;
--  c2gn:
--      R1 = _s2au::I64;
--      call (I64[Sp])(R1) args: 8, res: 0, upd: 8;
--  c2gp:
--      _s2cO::I64 = %MO_S_Rem_W64(%MO_UU_Conv_W8_W64(I8[_s2aq::I64 + (_s2an::I64 << 0)]) + _s2au::I64 * 128,
--                                 4091);
--      _s2an::I64 = _s2an::I64 + 1;
--      _s2au::I64 = _s2cO::I64;
--      goto s2ay;
--
-- a nice loop, but we didn't eliminate the silly assignment at the end.
-- See Note [dependent assignments], which would probably fix this.
-- This is #8336.
--
-- -----------
-- (2) From stg_atomically_frame in PrimOps.cmm
--
-- We have a diamond control flow:
--
--     x = ...
--       |
--      / \
--     A   B
--      \ /
--       |
--    use of x
--
-- Now x won't be sunk down to its use, because we won't push it into
-- both branches of the conditional.  We certainly do have to check
-- that we can sink it past all the code in both A and B, but having
-- discovered that, we could sink it to its use.
--

-- -----------------------------------------------------------------------------

type Assignment = (LocalReg, CmmExpr, AbsMem)
  -- Assignment caches AbsMem, an abstraction of the memory read by
  -- the RHS of the assignment.

type Assignments = [Assignment]
  -- A sequence of assignments; kept in *reverse* order
  -- So the list [ x=e1, y=e2 ] means the sequence of assignments
  --     y = e2
  --     x = e1

cmmSink :: DynFlags -> CmmGraph -> CmmGraph
cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
  where
  liveness = cmmLocalLiveness dflags graph
  getLive l = mapFindWithDefault Set.empty l liveness

  blocks = revPostorder graph

  join_pts = findJoinPoints blocks

  sink :: LabelMap Assignments -> [CmmBlock] -> [CmmBlock]
  sink _ [] = []
  sink sunk (b:bs) =
    -- pprTrace "sink" (ppr lbl) $
    blockJoin first final_middle final_last : sink sunk' bs
    where
      lbl = entryLabel b
      (first, middle, last) = blockSplit b

      succs = successors last

      -- Annotate the middle nodes with the registers live *after*
      -- the node.  This will help us decide whether we can inline
      -- an assignment in the current node or not.
      live = Set.unions (map getLive succs)
      live_middle = gen_kill dflags last live
      ann_middles = annotate dflags live_middle (blockToList middle)

      -- Now sink and inline in this block
      (middle', assigs) = walk dflags ann_middles (mapFindWithDefault [] lbl sunk)
      fold_last = constantFoldNode dflags last
      (final_last, assigs') = tryToInline dflags live fold_last assigs

      -- We cannot sink into join points (successors with more than
      -- one predecessor), so identify the join points and the set
      -- of registers live in them.
      (joins, nonjoins) = partition (`mapMember` join_pts) succs
      live_in_joins = Set.unions (map getLive joins)

      -- We do not want to sink an assignment into multiple branches,
      -- so identify the set of registers live in multiple successors.
      -- This is made more complicated because when we sink an assignment
      -- into one branch, this might change the set of registers that are
      -- now live in multiple branches.
      init_live_sets = map getLive nonjoins
      live_in_multi live_sets r =
         case filter (Set.member r) live_sets of
           (_one:_two:_) -> True
           _ -> False

      -- Now, drop any assignments that we will not sink any further.
      (dropped_last, assigs'') = dropAssignments dflags drop_if init_live_sets assigs'

      drop_if a@(r,rhs,_) live_sets = (should_drop, live_sets')
          where
            should_drop =  conflicts dflags a final_last
                        || not (isTrivial dflags rhs) && live_in_multi live_sets r
                        || r `Set.member` live_in_joins

            live_sets' | should_drop = live_sets
                       | otherwise   = map upd live_sets

            upd set | r `Set.member` set = set `Set.union` live_rhs
                    | otherwise          = set

            live_rhs = foldRegsUsed dflags extendRegSet emptyRegSet rhs

      final_middle = foldl' blockSnoc middle' dropped_last

      sunk' = mapUnion sunk $
                 mapFromList [ (l, filterAssignments dflags (getLive l) assigs'')
                             | l <- succs ]

{- TODO: enable this later, when we have some good tests in place to
   measure the effect and tune it.

-- small: an expression we don't mind duplicating
isSmall :: CmmExpr -> Bool
isSmall (CmmReg (CmmLocal _)) = True
isSmall (CmmLit _) = True
isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
isSmall (CmmRegOff (CmmLocal _) _) = True
isSmall _ = False
-}

--
-- We allow duplication of trivial expressions: registers (both local and
-- global) and literals.
--
isTrivial :: DynFlags -> CmmExpr -> Bool
isTrivial _ (CmmReg (CmmLocal _)) = True
isTrivial dflags (CmmReg (CmmGlobal r)) = -- see Note [Inline GlobalRegs?]
  if isARM (platformArch (targetPlatform dflags))
  then True -- CodeGen.Platform.ARM does not have globalRegMaybe
  else isJust (globalRegMaybe (targetPlatform dflags) r)
  -- GlobalRegs that are loads from BaseReg are not trivial
isTrivial _ (CmmLit _) = True
isTrivial _ _          = False

--
-- annotate each node with the set of registers live *after* the node
--
annotate :: DynFlags -> LocalRegSet -> [CmmNode O O] -> [(LocalRegSet, CmmNode O O)]
annotate dflags live nodes = snd $ foldr ann (live,[]) nodes
  where ann n (live,nodes) = (gen_kill dflags n live, (live,n) : nodes)
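
-- For example (a sketch; x, y, z stand for local registers): given the node
-- list [ x = y, z = x ] and live-after set {z}, the right fold pairs each
-- node with the registers live just after it:
--
--   annotate dflags {z} [ x = y, z = x ]  ==  [ ({x}, x = y), ({z}, z = x) ]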

--
-- Find the blocks that have multiple predecessors (the join points)
--
findJoinPoints :: [CmmBlock] -> LabelMap Int
findJoinPoints blocks = mapFilter (>1) succ_counts
 where
  all_succs = concatMap successors blocks

  succ_counts :: LabelMap Int
  succ_counts = foldr (\l -> mapInsertWith (+) l 1) mapEmpty all_succs
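
-- For example (a sketch with illustrative labels): if blocks A and B both
-- have successor C, then C appears twice in all_succs, succ_counts maps
-- C to 2, and C is identified as a join point.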

--
-- filter the list of assignments to remove any assignments that
-- are not live in a continuation.
--
filterAssignments :: DynFlags -> LocalRegSet -> Assignments -> Assignments
filterAssignments dflags live assigs = reverse (go assigs [])
  where go []             kept = kept
        go (a@(r,_,_):as) kept | needed    = go as (a:kept)
                               | otherwise = go as kept
           where
              needed = r `Set.member` live
                       || any (conflicts dflags a) (map toNode kept)
                       -- Note that we must keep assignments that are
                       -- referred to by other assignments we have
                       -- already kept.
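
-- For example (a sketch): with live = {x} and assignments [ x = y, y = e ]
-- (reverse order, so y = e comes first in the code), x = y is kept because
-- x is live, and y = e is then kept as well because the kept assignment
-- x = y mentions y.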

-- -----------------------------------------------------------------------------
-- Walk through the nodes of a block, sinking and inlining assignments
-- as we go.
--
-- On input we pass in:
--    * the list of nodes in the block
--    * a list of assignments that appeared *before* this block and
--      that are being sunk.
--
-- On output we get:
--    * a new block
--    * a list of assignments that will be placed *after* that block.
--

walk :: DynFlags
     -> [(LocalRegSet, CmmNode O O)]    -- nodes of the block, annotated with
                                        -- the set of registers live *after*
                                        -- this node.

     -> Assignments                     -- The current list of
                                        -- assignments we are sinking.
                                        -- Earlier assignments may refer
                                        -- to later ones.

     -> ( Block CmmNode O O             -- The new block
        , Assignments                   -- Assignments to sink further
        )

walk dflags nodes assigs = go nodes emptyBlock assigs
 where
   go []               block as = (block, as)
   go ((live,node):ns) block as
    | shouldDiscard node live           = go ns block as
       -- discard dead assignment
    | Just a <- shouldSink dflags node2 = go ns block (a : as1)
    | otherwise                         = go ns block' as'
    where
      node1 = constantFoldNode dflags node

      (node2, as1) = tryToInline dflags live node1 as

      (dropped, as') = dropAssignmentsSimple dflags
                          (\a -> conflicts dflags a node2) as1

      block' = foldl' blockSnoc block dropped `blockSnoc` node2


--
-- Heuristic to decide whether to pick up and sink an assignment.
-- Currently we pick up all assignments to local registers.  It might
-- be profitable to sink assignments to global regs too, but the
-- liveness analysis doesn't track those (yet) so we can't.
--
shouldSink :: DynFlags -> CmmNode e x -> Maybe Assignment
shouldSink dflags (CmmAssign (CmmLocal r) e) | no_local_regs = Just (r, e, exprMem dflags e)
  where no_local_regs = True -- foldRegsUsed (\_ _ -> False) True e
shouldSink _ _other = Nothing

--
-- discard dead assignments.  This doesn't do as good a job as
-- removeDeadAssignments, because it would need multiple passes
-- to get all the dead code, but it catches the common case of
-- superfluous reloads from the stack that the stack allocator
-- leaves behind.
--
-- Also we catch "r = r" here.  You might think it would fall
-- out of inlining, but the inliner will see that r is live
-- after the instruction and choose not to inline r in the rhs.
--
shouldDiscard :: CmmNode e x -> LocalRegSet -> Bool
shouldDiscard node live
   = case node of
       CmmAssign r (CmmReg r') | r == r' -> True
       CmmAssign (CmmLocal r) _ -> not (r `Set.member` live)
       _otherwise -> False


toNode :: Assignment -> CmmNode O O
toNode (r,rhs,_) = CmmAssign (CmmLocal r) rhs

dropAssignmentsSimple :: DynFlags -> (Assignment -> Bool) -> Assignments
                      -> ([CmmNode O O], Assignments)
dropAssignmentsSimple dflags f = dropAssignments dflags (\a _ -> (f a, ())) ()

dropAssignments :: DynFlags -> (Assignment -> s -> (Bool, s)) -> s -> Assignments
                -> ([CmmNode O O], Assignments)
dropAssignments dflags should_drop state assigs
 = (dropped, reverse kept)
 where
   (dropped,kept) = go state assigs [] []

   go _ []             dropped kept = (dropped, kept)
   go state (assig : rest) dropped kept
      | conflict  = go state' rest (toNode assig : dropped) kept
      | otherwise = go state' rest dropped (assig:kept)
      where
        (dropit, state') = should_drop assig state
        conflict = dropit || any (conflicts dflags assig) dropped


-- -----------------------------------------------------------------------------
-- Try to inline assignments into a node.
-- This also does constant folding for primops, since
-- inlining opens up opportunities for doing so.

tryToInline
   :: DynFlags
   -> LocalRegSet               -- set of registers live after this
                                -- node.  We cannot inline anything
                                -- that is live after the node, unless
                                -- it is small enough to duplicate.
   -> CmmNode O x               -- The node to inline into
   -> Assignments               -- Assignments to inline
   -> (
        CmmNode O x             -- New node
      , Assignments             -- Remaining assignments
      )

tryToInline dflags live node assigs = go usages node emptyLRegSet assigs
 where
  usages :: UniqFM Int -- Maps each LocalReg to a count of how often it is used
  usages = foldLocalRegsUsed dflags addUsage emptyUFM node

  go _usages node _skipped [] = (node, [])

  go usages node skipped (a@(l,rhs,_) : rest)
   | cannot_inline           = dont_inline
   | occurs_none             = discard  -- Note [discard during inlining]
   | occurs_once             = inline_and_discard
   | isTrivial dflags rhs    = inline_and_keep
   | otherwise               = dont_inline
   where
        inline_and_discard = go usages' inl_node skipped rest
          where usages' = foldLocalRegsUsed dflags addUsage usages rhs

        discard = go usages node skipped rest

        dont_inline        = keep node  -- don't inline the assignment, keep it
        inline_and_keep    = keep inl_node -- inline the assignment, keep it

        keep node' = (final_node, a : rest')
          where (final_node, rest') = go usages' node' (insertLRegSet l skipped) rest
                usages' = foldLocalRegsUsed dflags (\m r -> addToUFM m r 2)
                                            usages rhs
                -- we must not inline anything that is mentioned in the RHS
                -- of a binding that we have already skipped, so we set the
                -- usages of the regs on the RHS to 2.

        cannot_inline = skipped `regsUsedIn` rhs -- Note [dependent assignments]
                        || l `elemLRegSet` skipped
                        || not (okToInline dflags rhs node)

        l_usages = lookupUFM usages l
        l_live   = l `elemRegSet` live

        occurs_once = not l_live && l_usages == Just 1
        occurs_none = not l_live && l_usages == Nothing

        inl_node = improveConditional (mapExpDeep inl_exp node)

        inl_exp :: CmmExpr -> CmmExpr
        -- inl_exp is where the inlining actually takes place!
        inl_exp (CmmReg    (CmmLocal l'))     | l == l' = rhs
        inl_exp (CmmRegOff (CmmLocal l') off) | l == l'
                    = cmmOffset dflags rhs off
                    -- re-constant fold after inlining
        inl_exp (CmmMachOp op args) = cmmMachOpFold dflags op args
        inl_exp other = other
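
-- For example (a sketch): if the pending assignment is  x = R1  and x occurs
-- exactly once in the node  y = P64[x + 7]  with x dead afterwards, the
-- occurs_once case rewrites the node to  y = P64[R1 + 7]  and removes the
-- assignment to x from the list.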


{- Note [improveConditional]
   ~~~~~~~~~~~~~~~~~~~~~~~~~

cmmMachOpFold tries to simplify conditionals to turn things like
  (a == b) != 1
into
  (a != b)
but there's one case it can't handle: when the comparison is over
floating-point values, we can't invert it, because floating-point
comparisons aren't invertible (because of NaNs).

But we *can* optimise this conditional by swapping the true and false
branches. Given
  CmmCondBranch ((a >## b) != 1) t f
we can turn it into
  CmmCondBranch (a >## b) f t

So here we catch conditionals that weren't optimised by cmmMachOpFold,
and apply the above transformation to eliminate the comparison against 1.

It's tempting to just turn every != into == and then let cmmMachOpFold
do its thing, but that risks changing a nice fall-through conditional
into one that requires two jumps (see swapcond_last in CmmContFlowOpt),
so instead we carefully look for just the cases where we can eliminate
a comparison.
-}
improveConditional :: CmmNode O x -> CmmNode O x
improveConditional
  (CmmCondBranch (CmmMachOp mop [x, CmmLit (CmmInt 1 _)]) t f l)
  | neLike mop, isComparisonExpr x
  = CmmCondBranch x f t (fmap not l)
  where
    neLike (MO_Ne _) = True
    neLike (MO_U_Lt _) = True   -- (x<y) < 1 behaves like (x<y) != 1
    neLike (MO_S_Lt _) = True   -- (x<y) < 1 behaves like (x<y) != 1
    neLike _ = False
improveConditional other = other

-- Note [dependent assignments]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- If our assignment list looks like
--
--    [ y = e,  x = ... y ... ]
--
-- We cannot inline x.  Remember this list is really in reverse order,
-- so it means  x = ... y ...; y = e
--
-- Hence if we inline x, the outer assignment to y will capture the
-- reference in x's right hand side.
--
-- In this case we should rename the y in x's right-hand side,
-- i.e. change the list to [ y = e, x = ... y1 ..., y1 = y ]
-- Now we can go ahead and inline x.
--
-- For now we do nothing, because this would require putting
-- everything inside UniqSM.
--
-- One more variant of this (#7366):
--
--   [ y = e, y = z ]
--
-- If we don't want to inline y = e, because y is used many times, we
-- might still be tempted to inline y = z (because we always inline
-- trivial rhs's).  But of course we can't, because y is equal to e,
-- not z.

-- Note [discard during inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Opportunities to discard assignments sometimes appear after we've
-- done some inlining.  Here's an example:
--
--      x = R1;
--      y = P64[x + 7];
--      z = P64[x + 15];
--      /* z is dead */
--      R1 = y & (-8);
--
-- The x assignment is trivial, so we inline it in the RHS of y, and
-- keep both x and y.  z gets dropped because it is dead, then we
-- inline y, and we have a dead assignment to x.  If we don't notice
-- that x is dead in tryToInline, we end up retaining it.

addUsage :: UniqFM Int -> LocalReg -> UniqFM Int
addUsage m r = addToUFM_C (+) m r 1

regsUsedIn :: LRegSet -> CmmExpr -> Bool
regsUsedIn ls _ | nullLRegSet ls = False
regsUsedIn ls e = wrapRecExpf f e False
  where f (CmmReg (CmmLocal l))      _ | l `elemLRegSet` ls = True
        f (CmmRegOff (CmmLocal l) _) _ | l `elemLRegSet` ls = True
        f _ z = z

-- we don't inline into CmmUnsafeForeignCall if the expression refers
-- to global registers.  This is a HACK to avoid global registers
-- clashing with C argument-passing registers; really the back-end
-- ought to be able to handle it properly, but currently neither PprC
-- nor the NCG can do it.  See Note [Register parameter passing].
-- See also GHC.StgToCmm.Foreign.load_args_into_temps.
okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
okToInline dflags expr node@(CmmUnsafeForeignCall{}) =
    not (globalRegistersConflict dflags expr node)
okToInline _ _ _ = True

-- -----------------------------------------------------------------------------

-- | @conflicts (r,e) node@ is @False@ only if the assignment @r = e@ can
-- be safely commuted past statement @node@.  The check is conservative,
-- so @True@ does not necessarily indicate a real conflict.
conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
conflicts dflags (r, rhs, addr) node

  -- (1) node defines registers used by rhs of assignment. This catches
  -- assignments and all three kinds of calls. See Note [Sinking and calls]
  | globalRegistersConflict dflags rhs node                       = True
  | localRegistersConflict  dflags rhs node                       = True

  -- (2) node uses register defined by assignment
  | foldRegsUsed dflags (\b r' -> r == r' || b) False node        = True

  -- (3) a store to an address conflicts with a read of the same memory
  | CmmStore addr' e <- node
  , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True

  -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
  | HeapMem    <- addr, CmmAssign (CmmGlobal Hp) _ <- node        = True
  | StackMem   <- addr, CmmAssign (CmmGlobal Sp) _ <- node        = True
  | SpMem{}    <- addr, CmmAssign (CmmGlobal Sp) _ <- node        = True

  -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
  | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem      = True

  -- (6) native calls clobber any memory
  | CmmCall{} <- node, memConflicts addr AnyMem                   = True

  -- (7) otherwise, no conflict
  | otherwise = False
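
-- For example (a sketch): commuting  x = R1  past the store  P64[Sp - 8] = x
-- hits case (2), because the store uses the register x that the assignment
-- defines, so the assignment is dropped just before the store (see
-- Note [Sinking and calls] below).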

-- Returns True if node defines any global registers that are used in the
-- Cmm expression
globalRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
globalRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmGlobal r) expr)
                 False node

-- Returns True if node defines any local registers that are used in the
-- Cmm expression
localRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
localRegistersConflict dflags expr node =
    foldRegsDefd dflags (\b r -> b || regUsedIn dflags (CmmLocal  r) expr)
                 False node

-- Note [Sinking and calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~
--
-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
-- and unsafe foreign (CmmUnsafeForeignCall). We perform the sinking pass after
-- stack layout (see Note [Sinking after stack layout]) which leads to two
-- invariants related to calls:
--
--   a) during the stack layout phase all safe foreign calls are turned into
--      unsafe foreign calls (see Note [Lower safe foreign calls]). This
--      means that we will never encounter a CmmForeignCall node when running
--      sinking after stack layout
--
--   b) stack layout saves all variables live across a call on the stack
--      just before making a call (remember we are not sinking assignments to
--      the stack):
--
--       L1:
--          x = R1
--          P64[Sp - 16] = L2
--          P64[Sp - 8]  = x
--          Sp = Sp - 16
--          call f() returns L2
--       L2:
--
--      We will attempt to sink { x = R1 } but we will detect a conflict with
--      { P64[Sp - 8]  = x } and hence we will drop { x = R1 } without even
--      checking whether it conflicts with { call f() }. In this way we will
--      never need to check whether any assignment conflicts with CmmCall.
--      Remember that we still need to check for potential memory conflicts.
--
-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
-- This assumption holds only when we do sinking after stack layout. If we run
-- it before stack layout we need to check for possible conflicts with all three
-- kinds of calls. Our `conflicts` function does that by using the generic
-- foldRegsDefd and foldRegsUsed functions defined in the DefinerOfRegs and
-- UserOfRegs typeclasses.
--

-- An abstraction of memory read or written.
data AbsMem
  = NoMem            -- no memory accessed
  | AnyMem           -- arbitrary memory
  | HeapMem          -- definitely heap memory
  | StackMem         -- definitely stack memory
  | SpMem            -- <size>[Sp+n]
       {-# UNPACK #-} !Int
       {-# UNPACK #-} !Int

-- Having SpMem is important because it lets us float loads from Sp
-- past stores to Sp as long as they don't overlap, and this helps to
-- unravel some long sequences of
--    x1 = [Sp + 8]
--    x2 = [Sp + 16]
--    ...
--    [Sp + 8]  = xi
--    [Sp + 16] = xj
--
-- Note that SpMem is invalidated if Sp is changed, but the definition
-- of 'conflicts' above handles that.

-- ToDo: this won't currently fix the following commonly occurring code:
--    x1 = [R1 + 8]
--    x2 = [R1 + 16]
--    ..
--    [Hp - 8] = x1
--    [Hp - 16] = x2
--    ..

-- because [R1 + 8] and [Hp - 8] are both HeapMem.  We know that
-- assignments to [Hp + n] do not conflict with any other heap memory,
-- but this is tricky to nail down.  What if we had
--
--   x = Hp + n
--   [x] = ...
--
--  the store to [x] should be "new heap", not "old heap".
--  Furthermore, you could imagine that if we started inlining
--  functions in Cmm then there might well be reads of heap memory
--  that was written in the same basic block.  To take advantage of
--  non-aliasing of heap memory we will have to be more clever.

-- Note [Foreign calls clobber heap]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- It is tempting to say that foreign calls clobber only
-- non-heap/stack memory, but unfortunately we break this invariant in
-- the RTS.  For example, in stg_catch_retry_frame we call
-- stmCommitNestedTransaction() which modifies the contents of the
-- TRec it is passed (this actually caused incorrect code to be
-- generated).
--
-- Since the invariant is true for the majority of foreign calls,
-- perhaps we ought to have a special annotation for calls that can
-- modify heap/stack memory.  For now we just use the conservative
-- definition here.
--
-- Some CallishMachOps imply a memory barrier, e.g. AtomicRMW, and
-- therefore we should never float any memory operations across one of
-- these calls.


bothMems :: AbsMem -> AbsMem -> AbsMem
bothMems NoMem    x         = x
bothMems x        NoMem     = x
bothMems HeapMem  HeapMem   = HeapMem
bothMems StackMem StackMem  = StackMem
bothMems (SpMem o1 w1) (SpMem o2 w2)
  | o1 == o2  = SpMem o1 (max w1 w2)
  | otherwise = StackMem
bothMems SpMem{}  StackMem  = StackMem
bothMems StackMem SpMem{}   = StackMem
bothMems _        _         = AnyMem

memConflicts :: AbsMem -> AbsMem -> Bool
memConflicts NoMem      _          = False
memConflicts _          NoMem      = False
memConflicts HeapMem    StackMem   = False
memConflicts StackMem   HeapMem    = False
memConflicts SpMem{}    HeapMem    = False
memConflicts HeapMem    SpMem{}    = False
memConflicts (SpMem o1 w1) (SpMem o2 w2)
  | o1 < o2   = o1 + w1 > o2
  | otherwise = o2 + w2 > o1
memConflicts _          _          = True
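
-- For example (a sketch): two stack slots conflict only if their byte
-- ranges overlap:
--
--   memConflicts (SpMem 8 8)  (SpMem 16 8)  ==  False  -- disjoint slots
--   memConflicts (SpMem 8 16) (SpMem 16 8)  ==  True   -- ranges overlap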

exprMem :: DynFlags -> CmmExpr -> AbsMem
exprMem dflags (CmmLoad addr w)  = bothMems (loadAddr dflags addr (typeWidth w)) (exprMem dflags addr)
exprMem dflags (CmmMachOp _ es)  = foldr bothMems NoMem (map (exprMem dflags) es)
exprMem _      _                 = NoMem

loadAddr :: DynFlags -> CmmExpr -> Width -> AbsMem
loadAddr dflags e w =
  case e of
   CmmReg r       -> regAddr dflags r 0 w
   CmmRegOff r i  -> regAddr dflags r i w
   _other | regUsedIn dflags spReg e -> StackMem
          | otherwise -> AnyMem

regAddr :: DynFlags -> CmmReg -> Int -> Width -> AbsMem
regAddr _      (CmmGlobal Sp) i w = SpMem i (widthInBytes w)
regAddr _      (CmmGlobal Hp) _ _ = HeapMem
regAddr _      (CmmGlobal CurrentTSO) _ _ = HeapMem -- important for PrimOps
regAddr dflags r _ _ | isGcPtrType (cmmRegType dflags r) = HeapMem -- yay! GCPtr pays for itself
regAddr _      _ _ _ = AnyMem

{-
Note [Inline GlobalRegs?]
~~~~~~~~~~~~~~~~~~~~~~~~~

Should we freely inline GlobalRegs?

Actually it doesn't make a huge amount of difference either way, so we
*do* currently treat GlobalRegs as "trivial" and inline them
everywhere, but for what it's worth, here is what I discovered when I
(SimonM) looked into this:

Common sense says we should not inline GlobalRegs, because when we
have

  x = R1

the register allocator will coalesce this assignment, generating no
code, and simply record the fact that x is bound to $rbx (or
whatever).  Furthermore, if we were to sink this assignment, then the
range of code over which R1 is live increases, and the range of code
over which x is live decreases.  All things being equal, it is better
for x to be live than R1, because R1 is a fixed register whereas x can
live in any register.  So we should neither sink nor inline 'x = R1'.

However, not inlining GlobalRegs can have surprising
consequences. e.g. (cgrun020)

  c3EN:
      _s3DB::P64 = R1;
      _c3ES::P64 = _s3DB::P64 & 7;
      if (_c3ES::P64 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      _s3DD::P64 = P64[_s3DB::P64 + 6];
      _s3DE::P64 = P64[_s3DB::P64 + 14];
      I64[Sp - 8] = c3F0;
      R1 = _s3DE::P64;
      P64[Sp] = _s3DD::P64;

inlining the GlobalReg gives:

  c3EN:
      if (R1 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      _s3DD::P64 = P64[R1 + 6];
      R1 = P64[R1 + 14];
      P64[Sp] = _s3DD::P64;

but if we don't inline the GlobalReg, instead we get:

      _s3DB::P64 = R1;
      if (_s3DB::P64 & 7 >= 2) goto c3EU; else goto c3EV;
  c3EU:
      I64[Sp - 8] = c3F0;
      R1 = P64[_s3DB::P64 + 14];
      P64[Sp] = P64[_s3DB::P64 + 6];

This looks better - we managed to inline _s3DD - but in fact it
generates an extra reg-reg move:

.Lc3EU:
        movq $c3F0_info,-8(%rbp)
        movq %rbx,%rax
        movq 14(%rbx),%rbx
        movq 6(%rax),%rax
        movq %rax,(%rbp)

because _s3DB is now live across the R1 assignment, we lost the
benefit of coalescing.

Who is at fault here?  Perhaps if we knew that _s3DB was an alias for
R1, then we would not sink a reference to _s3DB past the R1
assignment.  Or perhaps we *should* do that - we might gain by sinking
it, despite losing the coalescing opportunity.

Sometimes not inlining global registers wins by virtue of the rule
about not inlining into arguments of a foreign call, e.g. (T7163):
this is what happens when we inline F1:

      _s3L2::F32 = F1;
      _c3O3::F32 = %MO_F_Mul_W32(F1, 10.0 :: W32);
      (_s3L7::F32) = call "ccall" arg hints:  []  result hints:  [] rintFloat(_c3O3::F32);

but if we don't inline F1:

      (_s3L7::F32) = call "ccall" arg hints:  []  result hints:  [] rintFloat(%MO_F_Mul_W32(_s3L2::F32,
                                                                                            10.0 :: W32));
-}