1{-
2(c) The University of Glasgow 2006
3(c) The AQUA Project, Glasgow University, 1994-1998
4
5
6Core-syntax unfoldings
7
8Unfoldings (which can travel across module boundaries) are in Core
9syntax (namely @CoreExpr@s).
10
11The type @Unfolding@ sits ``above'' simply-Core-expressions
12unfoldings, capturing ``higher-level'' things we know about a binding,
13usually things that the simplifier found out (e.g., ``it's a
14literal'').  In the corner of a @CoreUnfolding@ unfolding, you will
15find, unsurprisingly, a Core expression.
16-}
17
18{-# LANGUAGE CPP #-}
19{-# LANGUAGE BangPatterns #-}
20
21{-# OPTIONS_GHC -Wno-incomplete-record-updates #-}
22
23module GHC.Core.Unfold (
24        Unfolding, UnfoldingGuidance,   -- Abstract types
25
26        UnfoldingOpts (..), defaultUnfoldingOpts,
27        updateCreationThreshold, updateUseThreshold,
28        updateFunAppDiscount, updateDictDiscount,
29        updateVeryAggressive, updateCaseScaling, updateCaseThreshold,
30
31        ArgSummary(..),
32
33        couldBeSmallEnoughToInline, inlineBoringOk,
34        certainlyWillInline, smallEnoughToInline,
35
36        callSiteInline, CallCtxt(..),
37        calcUnfoldingGuidance
38    ) where
39
40#include "GhclibHsVersions.h"
41
42import GHC.Prelude
43
44import GHC.Driver.Session
45import GHC.Driver.Ppr
46import GHC.Core
47import GHC.Core.Utils
48import GHC.Types.Id
49import GHC.Types.Demand ( isDeadEndSig )
50import GHC.Core.DataCon
51import GHC.Types.Literal
52import GHC.Builtin.PrimOps
53import GHC.Types.Id.Info
54import GHC.Types.Basic  ( Arity, InlineSpec(..), inlinePragmaSpec )
55import GHC.Core.Type
56import GHC.Builtin.Names
57import GHC.Builtin.Types.Prim ( realWorldStatePrimTy )
58import GHC.Data.Bag
59import GHC.Utils.Logger
60import GHC.Utils.Misc
61import GHC.Utils.Outputable
62import GHC.Types.ForeignCall
63import GHC.Types.Name
64import GHC.Types.Tickish
65
66import qualified Data.ByteString as BS
67import Data.List (isPrefixOf)
68
69
70-- | Unfolding options
71data UnfoldingOpts = UnfoldingOpts
72   { unfoldingCreationThreshold :: !Int
73      -- ^ Threshold above which unfoldings are not *created*
74
75   , unfoldingUseThreshold :: !Int
76      -- ^ Threshold above which unfoldings are not *inlined*
77
78   , unfoldingFunAppDiscount :: !Int
79      -- ^ Discount for lambdas that are used (applied)
80
81   , unfoldingDictDiscount :: !Int
82      -- ^ Discount for dictionaries
83
84   , unfoldingVeryAggressive :: !Bool
85      -- ^ Force inlining in many more cases
86
87      -- Don't consider depth up to x
88   , unfoldingCaseThreshold :: !Int
89
90      -- Penalize depth with 1/x
91   , unfoldingCaseScaling :: !Int
92   }
93
94defaultUnfoldingOpts :: UnfoldingOpts
95defaultUnfoldingOpts = UnfoldingOpts
96   { unfoldingCreationThreshold = 750
97      -- The unfoldingCreationThreshold threshold must be reasonably high
98      -- to take account of possible discounts.
99      -- E.g. 450 is not enough in 'fulsom' for Interval.sqr to
100      -- inline into Csg.calc (The unfolding for sqr never makes it
101      -- into the interface file.)
102
103   , unfoldingUseThreshold   = 90
104      -- Last adjusted upwards in #18282, when I reduced
105      -- the result discount for constructors.
106
107   , unfoldingFunAppDiscount = 60
108      -- Be fairly keen to inline a function if that means
109      -- we'll be able to pick the right method from a dictionary
110
111   , unfoldingDictDiscount   = 30
112      -- Be fairly keen to inline a function if that means
113      -- we'll be able to pick the right method from a dictionary
114
115   , unfoldingVeryAggressive = False
116
117      -- Only apply scaling once we are deeper than threshold cases
118      -- in an RHS.
119   , unfoldingCaseThreshold = 2
120
121      -- Penalize depth with (size*depth)/scaling
122   , unfoldingCaseScaling = 30
123   }
124
125-- Helpers for "GHC.Driver.Session"
126
127updateCreationThreshold :: Int -> UnfoldingOpts -> UnfoldingOpts
128updateCreationThreshold n opts = opts { unfoldingCreationThreshold = n }
129
130updateUseThreshold :: Int -> UnfoldingOpts -> UnfoldingOpts
131updateUseThreshold n opts = opts { unfoldingUseThreshold = n }
132
133updateFunAppDiscount :: Int -> UnfoldingOpts -> UnfoldingOpts
134updateFunAppDiscount n opts = opts { unfoldingFunAppDiscount = n }
135
136updateDictDiscount :: Int -> UnfoldingOpts -> UnfoldingOpts
137updateDictDiscount n opts = opts { unfoldingDictDiscount = n }
138
139updateVeryAggressive :: Bool -> UnfoldingOpts -> UnfoldingOpts
140updateVeryAggressive n opts = opts { unfoldingVeryAggressive = n }
141
142
143updateCaseThreshold :: Int -> UnfoldingOpts -> UnfoldingOpts
144updateCaseThreshold n opts = opts { unfoldingCaseThreshold = n }
145
146updateCaseScaling :: Int -> UnfoldingOpts -> UnfoldingOpts
147updateCaseScaling n opts = opts { unfoldingCaseScaling = n }
148
149{-
150Note [Occurrence analysis of unfoldings]
151~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
152We do occurrence-analysis of unfoldings once and for all, when the
153unfolding is built, rather than each time we inline them.
154
155But given this decision it's vital that we do
156*always* do it.  Consider this unfolding
157    \x -> letrec { f = ...g...; g* = f } in body
158where g* is (for some strange reason) the loop breaker.  If we don't
159occ-anal it when reading it in, we won't mark g as a loop breaker, and
160we may inline g entirely in body, dropping its binding, and leaving
161the occurrence in f out of scope. This happened in #8892, where
162the unfolding in question was a DFun unfolding.
163
164But more generally, the simplifier is designed on the
165basis that it is looking at occurrence-analysed expressions, so better
166ensure that they actually are.
167
168Note [Calculate unfolding guidance on the non-occ-anal'd expression]
169~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
170Notice that we give the non-occur-analysed expression to
171calcUnfoldingGuidance.  In some ways it'd be better to occur-analyse
172first; for example, sometimes during simplification, there's a large
173let-bound thing which has been substituted, and so is now dead; so
174'expr' contains two copies of the thing while the occurrence-analysed
175expression doesn't.
176
177Nevertheless, we *don't* and *must not* occ-analyse before computing
178the size because
179
180a) The size computation bales out after a while, whereas occurrence
181   analysis does not.
182
183b) Residency increases sharply if you occ-anal first.  I'm not
184   100% sure why, but it's a large effect.  Compiling Cabal went
185   from residency of 534M to over 800M with this one change.
186
187This can occasionally mean that the guidance is very pessimistic;
188it gets fixed up next round.  And it should be rare, because large
189let-bound things that are dead are usually caught by preInlineUnconditionally
190
191
192************************************************************************
193*                                                                      *
194\subsection{The UnfoldingGuidance type}
195*                                                                      *
196************************************************************************
197-}
198
199inlineBoringOk :: CoreExpr -> Bool
200-- See Note [INLINE for small functions]
201-- True => the result of inlining the expression is
202--         no bigger than the expression itself
203--     eg      (\x y -> f y x)
204-- This is a quick and dirty version. It doesn't attempt
205-- to deal with  (\x y z -> x (y z))
206-- The really important one is (x `cast` c)
207inlineBoringOk e
208  = go 0 e
209  where
210    go :: Int -> CoreExpr -> Bool
211    go credit (Lam x e) | isId x           = go (credit+1) e
212                        | otherwise        = go credit e
213        -- See Note [Count coercion arguments in boring contexts]
214    go credit (App f (Type {}))            = go credit f
215    go credit (App f a) | credit > 0
216                        , exprIsTrivial a  = go (credit-1) f
217    go credit (Tick _ e)                   = go credit e -- dubious
218    go credit (Cast e _)                   = go credit e
219    go credit (Case scrut _ _ [Alt _ _ rhs]) -- See Note [Inline unsafeCoerce]
220      | isUnsafeEqualityProof scrut        = go credit rhs
221    go _      (Var {})                     = boringCxtOk
222    go _      _                            = boringCxtNotOk
223
224calcUnfoldingGuidance
225        :: UnfoldingOpts
226        -> Bool          -- Definitely a top-level, bottoming binding
227        -> CoreExpr      -- Expression to look at
228        -> UnfoldingGuidance
229calcUnfoldingGuidance opts is_top_bottoming (Tick t expr)
230  | not (tickishIsCode t)  -- non-code ticks don't matter for unfolding
231  = calcUnfoldingGuidance opts is_top_bottoming expr
232calcUnfoldingGuidance opts is_top_bottoming expr
233  = case sizeExpr opts bOMB_OUT_SIZE val_bndrs body of
234      TooBig -> UnfNever
235      SizeIs size cased_bndrs scrut_discount
236        | uncondInline expr n_val_bndrs size
237        -> UnfWhen { ug_unsat_ok = unSaturatedOk
238                   , ug_boring_ok =  boringCxtOk
239                   , ug_arity = n_val_bndrs }   -- Note [INLINE for small functions]
240
241        | is_top_bottoming
242        -> UnfNever   -- See Note [Do not inline top-level bottoming functions]
243
244        | otherwise
245        -> UnfIfGoodArgs { ug_args  = map (mk_discount cased_bndrs) val_bndrs
246                         , ug_size  = size
247                         , ug_res   = scrut_discount }
248
249  where
250    (bndrs, body) = collectBinders expr
251    bOMB_OUT_SIZE = unfoldingCreationThreshold opts
252           -- Bomb out if size gets bigger than this
253    val_bndrs   = filter isId bndrs
254    n_val_bndrs = length val_bndrs
255
256    mk_discount :: Bag (Id,Int) -> Id -> Int
257    mk_discount cbs bndr = foldl' combine 0 cbs
258           where
259             combine acc (bndr', disc)
260               | bndr == bndr' = acc `plus_disc` disc
261               | otherwise     = acc
262
263             plus_disc :: Int -> Int -> Int
264             plus_disc | isFunTy (idType bndr) = max
265                       | otherwise             = (+)
266             -- See Note [Function and non-function discounts]
267
268{- Note [Inline unsafeCoerce]
269~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
270We really want to inline unsafeCoerce, even when applied to boring
271arguments.  It doesn't look as if its RHS is smaller than the call
272   unsafeCoerce x = case unsafeEqualityProof @a @b of UnsafeRefl -> x
273but that case is discarded -- see Note [Implementing unsafeCoerce]
274in base:Unsafe.Coerce.
275
276Moreover, if we /don't/ inline it, we may be left with
277          f (unsafeCoerce x)
278which will build a thunk -- bad, bad, bad.
279
280Conclusion: we really want inlineBoringOk to be True of the RHS of
281unsafeCoerce.  This is (U4) in Note [Implementing unsafeCoerce].
282
283Note [Computing the size of an expression]
284~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
285The basic idea of sizeExpr is obvious enough: count nodes.  But getting the
286heuristics right has taken a long time.  Here's the basic strategy:
287
288    * Variables, literals: 0
289      (Exception for string literals, see litSize.)
290
291    * Function applications (f e1 .. en): 1 + #value args
292
293    * Constructor applications: 1, regardless of #args
294
295    * Let(rec): 1 + size of components
296
297    * Note, cast: 0
298
299Examples
300
301  Size  Term
302  --------------
303    0     42#
304    0     x
305    0     True
306    2     f x
307    1     Just x
308    4     f (g x)
309
310Notice that 'x' counts 0, while (f x) counts 2.  That's deliberate: there's
311a function call to account for.  Notice also that constructor applications
312are very cheap, because exposing them to a caller is so valuable.
313
314[25/5/11] All sizes are now multiplied by 10, except for primops
315(which have sizes like 1 or 4.  This makes primops look fantastically
316cheap, and seems to be almost universally beneficial.  Done partly as a
317result of #4978.
318
319Note [Do not inline top-level bottoming functions]
320~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
321The FloatOut pass has gone to some trouble to float out calls to 'error'
322and similar friends.  See Note [Bottoming floats] in GHC.Core.Opt.SetLevels.
323Do not re-inline them!  But we *do* still inline if they are very small
324(the uncondInline stuff).
325
326Note [INLINE for small functions]
327~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
328Consider        {-# INLINE f #-}
329                f x = Just x
330                g y = f y
331Then f's RHS is no larger than its LHS, so we should inline it into
332even the most boring context.  In general, f the function is
333sufficiently small that its body is as small as the call itself, the
334inline unconditionally, regardless of how boring the context is.
335
336Things to note:
337
338(1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
339    than the thing it's replacing.  Notice that
340      (f x) --> (g 3)             -- YES, unconditionally
341      (f x) --> x : []            -- YES, *even though* there are two
342                                  --      arguments to the cons
343      x     --> g 3               -- NO
344      x     --> Just v            -- NO
345
346    It's very important not to unconditionally replace a variable by
347    a non-atomic term.
348
349(2) We do this even if the thing isn't saturated, else we end up with the
350    silly situation that
351       f x y = x
352       ...map (f 3)...
353    doesn't inline.  Even in a boring context, inlining without being
354    saturated will give a lambda instead of a PAP, and will be more
355    efficient at runtime.
356
357(3) However, when the function's arity > 0, we do insist that it
358    has at least one value argument at the call site.  (This check is
359    made in the UnfWhen case of callSiteInline.) Otherwise we find this:
360         f = /\a \x:a. x
361         d = /\b. MkD (f b)
362    If we inline f here we get
363         d = /\b. MkD (\x:b. x)
364    and then prepareRhs floats out the argument, abstracting the type
365    variables, so we end up with the original again!
366
367(4) We must be much more cautious about arity-zero things. Consider
368       let x = y +# z in ...
369    In *size* terms primops look very small, because the generate a
370    single instruction, but we do not want to unconditionally replace
371    every occurrence of x with (y +# z).  So we only do the
372    unconditional-inline thing for *trivial* expressions.
373
374    NB: you might think that PostInlineUnconditionally would do this
375    but it doesn't fire for top-level things; see GHC.Core.Opt.Simplify.Utils
376    Note [Top level and postInlineUnconditionally]
377
378Note [Count coercion arguments in boring contexts]
379~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
380In inlineBoringOK, we ignore type arguments when deciding whether an
381expression is okay to inline into boring contexts. This is good, since
382if we have a definition like
383
384  let y = x @Int in f y y
385
386there’s no reason not to inline y at both use sites — no work is
387actually duplicated. It may seem like the same reasoning applies to
388coercion arguments, and indeed, in #17182 we changed inlineBoringOK to
389treat coercions the same way.
390
391However, this isn’t a good idea: unlike type arguments, which have
392no runtime representation, coercion arguments *do* have a runtime
393representation (albeit the zero-width VoidRep, see Note [Coercion tokens]
394in "GHC.CoreToStg"). This caused trouble in #17787 for DataCon wrappers for
395nullary GADT constructors: the wrappers would be inlined and each use of
396the constructor would lead to a separate allocation instead of just
397sharing the wrapper closure.
398
399The solution: don’t ignore coercion arguments after all.
400-}
401
402uncondInline :: CoreExpr -> Arity -> Int -> Bool
403-- Inline unconditionally if there no size increase
404-- Size of call is arity (+1 for the function)
405-- See Note [INLINE for small functions]
406uncondInline rhs arity size
407  | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
408  | otherwise = exprIsTrivial rhs        -- See Note [INLINE for small functions] (4)
409
410sizeExpr :: UnfoldingOpts
411         -> Int             -- Bomb out if it gets bigger than this
412         -> [Id]            -- Arguments; we're interested in which of these
413                            -- get case'd
414         -> CoreExpr
415         -> ExprSize
416
417-- Note [Computing the size of an expression]
418
419-- Forcing bOMB_OUT_SIZE early prevents repeated
420-- unboxing of the Int argument.
421sizeExpr opts !bOMB_OUT_SIZE top_args expr
422  = size_up expr
423  where
424    size_up (Cast e _) = size_up e
425    size_up (Tick _ e) = size_up e
426    size_up (Type _)   = sizeZero           -- Types cost nothing
427    size_up (Coercion _) = sizeZero
428    size_up (Lit lit)  = sizeN (litSize lit)
429    size_up (Var f) | isRealWorldId f = sizeZero
430                      -- Make sure we get constructor discounts even
431                      -- on nullary constructors
432                    | otherwise       = size_up_call f [] 0
433
434    size_up (App fun arg)
435      | isTyCoArg arg = size_up fun
436      | otherwise     = size_up arg  `addSizeNSD`
437                        size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)
438
439    size_up (Lam b e)
440      | isId b && not (isRealWorldId b) = lamScrutDiscount opts (size_up e `addSizeN` 10)
441      | otherwise = size_up e
442
443    size_up (Let (NonRec binder rhs) body)
444      = size_up_rhs (binder, rhs) `addSizeNSD`
445        size_up body              `addSizeN`
446        size_up_alloc binder
447
448    size_up (Let (Rec pairs) body)
449      = foldr (addSizeNSD . size_up_rhs)
450              (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs))
451              pairs
452
453    size_up (Case e _ _ alts)
454        | null alts
455        = size_up e    -- case e of {} never returns, so take size of scrutinee
456
457    size_up (Case e _ _ alts)
458        -- Now alts is non-empty
459        | Just v <- is_top_arg e -- We are scrutinising an argument variable
460        = let
461            alt_sizes = map size_up_alt alts
462
463                  -- alts_size tries to compute a good discount for
464                  -- the case when we are scrutinising an argument variable
465            alts_size (SizeIs tot tot_disc tot_scrut)
466                          -- Size of all alternatives
467                      (SizeIs max _        _)
468                          -- Size of biggest alternative
469                  = SizeIs tot (unitBag (v, 20 + tot - max)
470                      `unionBags` tot_disc) tot_scrut
471                          -- If the variable is known, we produce a
472                          -- discount that will take us back to 'max',
473                          -- the size of the largest alternative The
474                          -- 1+ is a little discount for reduced
475                          -- allocation in the caller
476                          --
477                          -- Notice though, that we return tot_disc,
478                          -- the total discount from all branches.  I
479                          -- think that's right.
480
481            alts_size tot_size _ = tot_size
482          in
483          alts_size (foldr1 addAltSize alt_sizes)  -- alts is non-empty
484                    (foldr1 maxSize    alt_sizes)
485                -- Good to inline if an arg is scrutinised, because
486                -- that may eliminate allocation in the caller
487                -- And it eliminates the case itself
488        where
489          is_top_arg (Var v) | v `elem` top_args = Just v
490          is_top_arg (Cast e _) = is_top_arg e
491          is_top_arg _ = Nothing
492
493
494    size_up (Case e _ _ alts) = size_up e  `addSizeNSD`
495                                foldr (addAltSize . size_up_alt) case_size alts
496      where
497          case_size
498           | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10)
499           | otherwise = sizeZero
500                -- Normally we don't charge for the case itself, but
501                -- we charge one per alternative (see size_up_alt,
502                -- below) to account for the cost of the info table
503                -- and comparisons.
504                --
505                -- However, in certain cases (see is_inline_scrut
506                -- below), no code is generated for the case unless
507                -- there are multiple alts.  In these cases we
508                -- subtract one, making the first alt free.
509                -- e.g. case x# +# y# of _ -> ...   should cost 1
510                --      case touch# x# of _ -> ...  should cost 0
511                -- (see #4978)
512                --
513                -- I would like to not have the "lengthAtMost alts 1"
514                -- condition above, but without that some programs got worse
515                -- (spectral/hartel/event and spectral/para).  I don't fully
516                -- understand why. (SDM 24/5/11)
517
518                -- unboxed variables, inline primops and unsafe foreign calls
519                -- are all "inline" things:
520          is_inline_scrut (Var v) = isUnliftedType (idType v)
521          is_inline_scrut scrut
522              | (Var f, _) <- collectArgs scrut
523                = case idDetails f of
524                    FCallId fc  -> not (isSafeForeignCall fc)
525                    PrimOpId op -> not (primOpOutOfLine op)
526                    _other      -> False
527              | otherwise
528                = False
529
530    size_up_rhs (bndr, rhs)
531      | Just join_arity <- isJoinId_maybe bndr
532        -- Skip arguments to join point
533      , (_bndrs, body) <- collectNBinders join_arity rhs
534      = size_up body
535      | otherwise
536      = size_up rhs
537
538    ------------
539    -- size_up_app is used when there's ONE OR MORE value args
540    size_up_app (App fun arg) args voids
541        | isTyCoArg arg                  = size_up_app fun args voids
542        | isRealWorldExpr arg            = size_up_app fun (arg:args) (voids + 1)
543        | otherwise                      = size_up arg  `addSizeNSD`
544                                           size_up_app fun (arg:args) voids
545    size_up_app (Var fun)     args voids = size_up_call fun args voids
546    size_up_app (Tick _ expr) args voids = size_up_app expr args voids
547    size_up_app (Cast expr _) args voids = size_up_app expr args voids
548    size_up_app other         args voids = size_up other `addSizeN`
549                                           callSize (length args) voids
550       -- if the lhs is not an App or a Var, or an invisible thing like a
551       -- Tick or Cast, then we should charge for a complete call plus the
552       -- size of the lhs itself.
553
554    ------------
555    size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
556    size_up_call fun val_args voids
557       = case idDetails fun of
558           FCallId _        -> sizeN (callSize (length val_args) voids)
559           DataConWorkId dc -> conSize    dc (length val_args)
560           PrimOpId op      -> primOpSize op (length val_args)
561           ClassOpId _      -> classOpSize opts top_args val_args
562           _                -> funSize opts top_args fun (length val_args) voids
563
564    ------------
565    size_up_alt (Alt _con _bndrs rhs) = size_up rhs `addSizeN` 10
566        -- Don't charge for args, so that wrappers look cheap
567        -- (See comments about wrappers with Case)
568        --
569        -- IMPORTANT: *do* charge 1 for the alternative, else we
570        -- find that giant case nests are treated as practically free
571        -- A good example is Foreign.C.Error.errnoToIOError
572
573    ------------
574    -- Cost to allocate binding with given binder
575    size_up_alloc bndr
576      |  isTyVar bndr                 -- Doesn't exist at runtime
577      || isJoinId bndr                -- Not allocated at all
578      || isUnliftedType (idType bndr) -- Doesn't live in heap
579      = 0
580      | otherwise
581      = 10
582
583    ------------
584        -- These addSize things have to be here because
585        -- I don't want to give them bOMB_OUT_SIZE as an argument
586    addSizeN TooBig          _  = TooBig
587    addSizeN (SizeIs n xs d) m  = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
588
589        -- addAltSize is used to add the sizes of case alternatives
590    addAltSize TooBig            _      = TooBig
591    addAltSize _                 TooBig = TooBig
592    addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
593        = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
594                                 (xs `unionBags` ys)
595                                 (d1 + d2) -- Note [addAltSize result discounts]
596
597        -- This variant ignores the result discount from its LEFT argument
598        -- It's used when the second argument isn't part of the result
599    addSizeNSD TooBig            _      = TooBig
600    addSizeNSD _                 TooBig = TooBig
601    addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
602        = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
603                                 (xs `unionBags` ys)
604                                 d2  -- Ignore d1
605
606    isRealWorldId id = idType id `eqType` realWorldStatePrimTy
607
608    -- an expression of type State# RealWorld must be a variable
609    isRealWorldExpr (Var id)   = isRealWorldId id
610    isRealWorldExpr (Tick _ e) = isRealWorldExpr e
611    isRealWorldExpr _          = False
612
613-- | Finds a nominal size of a string literal.
614litSize :: Literal -> Int
615-- Used by GHC.Core.Unfold.sizeExpr
616litSize (LitNumber LitNumInteger _) = 100   -- Note [Size of literal integers]
617litSize (LitNumber LitNumNatural _) = 100
618litSize (LitString str) = 10 + 10 * ((BS.length str + 3) `div` 4)
619        -- If size could be 0 then @f "x"@ might be too small
620        -- [Sept03: make literal strings a bit bigger to avoid fruitless
621        --  duplication of little strings]
622litSize _other = 0    -- Must match size of nullary constructors
623                      -- Key point: if  x |-> 4, then x must inline unconditionally
624                      --            (eg via case binding)
625
626classOpSize :: UnfoldingOpts -> [Id] -> [CoreExpr] -> ExprSize
627-- See Note [Conlike is interesting]
628classOpSize _ _ []
629  = sizeZero
630classOpSize opts top_args (arg1 : other_args)
631  = SizeIs size arg_discount 0
632  where
633    size = 20 + (10 * length other_args)
634    -- If the class op is scrutinising a lambda bound dictionary then
635    -- give it a discount, to encourage the inlining of this function
636    -- The actual discount is rather arbitrarily chosen
637    arg_discount = case arg1 of
638                     Var dict | dict `elem` top_args
639                              -> unitBag (dict, unfoldingDictDiscount opts)
640                     _other   -> emptyBag
641
642-- | The size of a function call
643callSize
644 :: Int  -- ^ number of value args
645 -> Int  -- ^ number of value args that are void
646 -> Int
647callSize n_val_args voids = 10 * (1 + n_val_args - voids)
648        -- The 1+ is for the function itself
649        -- Add 1 for each non-trivial arg;
650        -- the allocation cost, as in let(rec)
651
652-- | The size of a jump to a join point
653jumpSize
654 :: Int  -- ^ number of value args
655 -> Int  -- ^ number of value args that are void
656 -> Int
657jumpSize n_val_args voids = 2 * (1 + n_val_args - voids)
658  -- A jump is 20% the size of a function call. Making jumps free reopens
659  -- bug #6048, but making them any more expensive loses a 21% improvement in
660  -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a
661  -- better solution?
662
663funSize :: UnfoldingOpts -> [Id] -> Id -> Int -> Int -> ExprSize
664-- Size for functions that are not constructors or primops
665-- Note [Function applications]
666funSize opts top_args fun n_val_args voids
667  | fun `hasKey` buildIdKey   = buildSize
668  | fun `hasKey` augmentIdKey = augmentSize
669  | otherwise = SizeIs size arg_discount res_discount
670  where
671    some_val_args = n_val_args > 0
672    is_join = isJoinId fun
673
674    size | is_join              = jumpSize n_val_args voids
675         | not some_val_args    = 0
676         | otherwise            = callSize n_val_args voids
677
678        --                  DISCOUNTS
679        --  See Note [Function and non-function discounts]
680    arg_discount | some_val_args && fun `elem` top_args
681                 = unitBag (fun, unfoldingFunAppDiscount opts)
682                 | otherwise = emptyBag
683        -- If the function is an argument and is applied
684        -- to some values, give it an arg-discount
685
686    res_discount | idArity fun > n_val_args = unfoldingFunAppDiscount opts
687                 | otherwise                = 0
688        -- If the function is partially applied, show a result discount
689-- XXX maybe behave like ConSize for eval'd variable
690
691conSize :: DataCon -> Int -> ExprSize
692conSize dc n_val_args
693  | n_val_args == 0 = SizeIs 0 emptyBag 10    -- Like variables
694
695-- See Note [Unboxed tuple size and result discount]
696  | isUnboxedTupleDataCon dc = SizeIs 0 emptyBag 10
697
698-- See Note [Constructor size and result discount]
699  | otherwise = SizeIs 10 emptyBag 10
700
701{- Note [Constructor size and result discount]
702~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
703Treat a constructors application as size 10, regardless of how many
704arguments it has; we are keen to expose them (and we charge separately
705for their args).  We can't treat them as size zero, else we find that
706(Just x) has size 0, which is the same as a lone variable; and hence
707'v' will always be replaced by (Just x), where v is bound to Just x.
708
709The "result discount" is applied if the result of the call is
710scrutinised (say by a case).  For a constructor application that will
711mean the constructor application will disappear, so we don't need to
712charge it to the function.  So the discount should at least match the
713cost of the constructor application, namely 10.
714
715Historical note 1: Until Jun 2020 we gave it a "bit of extra
716incentive" via a discount of 10*(1 + n_val_args), but that was FAR too
717much (#18282).  In particular, consider a huge case tree like
718
719   let r = case y1 of
720          Nothing -> B1 a b c
721          Just v1 -> case y2 of
722                      Nothing -> B1 c b a
723                      Just v2 -> ...
724
725If conSize gives a cost of 10 (regardless of n_val_args) and a
726discount of 10, that'll make each alternative RHS cost zero.  We
727charge 10 for each case alternative (see size_up_alt).  If we give a
728bigger discount (say 20) in conSize, we'll make the case expression
729cost *nothing*, and that can make a huge case tree cost nothing. This
730leads to massive, sometimes exponenial inlinings (#18282).  In short,
731don't give a discount that give a negative size to a sub-expression!
732
733Historical note 2: Much longer ago, Simon M tried a MUCH bigger
734discount: (10 * (10 + n_val_args)), and said it was an "unambiguous
735win", but its terribly dangerous because a function with many many
736case branches, each finishing with a constructor, can have an
737arbitrarily large discount.  This led to terrible code bloat: see #6099.
738
739Note [Unboxed tuple size and result discount]
740~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
741However, unboxed tuples count as size zero. I found occasions where we had
742        f x y z = case op# x y z of { s -> (# s, () #) }
743and f wasn't getting inlined.
744
745I tried giving unboxed tuples a *result discount* of zero (see the
746commented-out line).  Why?  When returned as a result they do not
747allocate, so maybe we don't want to charge so much for them. If you
748have a non-zero discount here, we find that workers often get inlined
749back into wrappers, because it look like
750    f x = case $wf x of (# a,b #) -> (a,b)
751and we are keener because of the case.  However while this change
752shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
753more. All other changes were very small. So it's not a big deal but I
754didn't adopt the idea.
755
756When fixing #18282 (see Note [Constructor size and result discount])
757I changed the result discount to be just 10, not 10*(1+n_val_args).
758
759Note [Function and non-function discounts]
760~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
761We want a discount if the function is applied. A good example is
762monadic combinators with continuation arguments, where inlining is
763quite important.
764
765But we don't want a big discount when a function is called many times
766(see the detailed comments with #6048) because if the function is
767big it won't be inlined at its many call sites and no benefit results.
768Indeed, we can get exponentially big inlinings this way; that is what
769#6048 is about.
770
771On the other hand, for data-valued arguments, if there are lots of
772case expressions in the body, each one will get smaller if we apply
773the function to a constructor application, so we *want* a big discount
774if the argument is scrutinised by many case expressions.
775
776Conclusion:
777  - For functions, take the max of the discounts
778  - For data values, take the sum of the discounts
779
780
781Note [Literal integer size]
782~~~~~~~~~~~~~~~~~~~~~~~~~~~
783Literal integers *can* be big (mkInteger [...coefficients...]), but
784need not be (IS n).  We just use an arbitrary big-ish constant here
785so that, in particular, we don't inline top-level defns like
786   n = IS 5
787There's no point in doing so -- any optimisations will see the IS
788through n's unfolding.  Nor will a big size inhibit unfoldings functions
789that mention a literal Integer, because the float-out pass will float
790all those constants to top level.
791-}
792
793primOpSize :: PrimOp -> Int -> ExprSize
794primOpSize op n_val_args
795 = if primOpOutOfLine op
796      then sizeN (op_size + n_val_args)
797      else sizeN op_size
798 where
799   op_size = primOpCodeSize op
800
801
802buildSize :: ExprSize
803buildSize = SizeIs 0 emptyBag 40
804        -- We really want to inline applications of build
805        -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
806        -- Indeed, we should add a result_discount because build is
807        -- very like a constructor.  We don't bother to check that the
808        -- build is saturated (it usually is).  The "-2" discounts for the \c n,
809        -- The "4" is rather arbitrary.
810
811augmentSize :: ExprSize
812augmentSize = SizeIs 0 emptyBag 40
813        -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
814        -- e plus ys. The -2 accounts for the \cn
815
816-- When we return a lambda, give a discount if it's used (applied)
817lamScrutDiscount :: UnfoldingOpts -> ExprSize -> ExprSize
818lamScrutDiscount opts (SizeIs n vs _) = SizeIs n vs (unfoldingFunAppDiscount opts)
819lamScrutDiscount _      TooBig          = TooBig
820
821{-
822Note [addAltSize result discounts]
823~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
824When adding the size of alternatives, we *add* the result discounts
825too, rather than take the *maximum*.  For a multi-branch case, this
826gives a discount for each branch that returns a constructor, making us
827keener to inline.  I did try using 'max' instead, but it makes nofib
828'rewrite' and 'puzzle' allocate significantly more, and didn't make
829binary sizes shrink significantly either.
830
831Note [Discounts and thresholds]
832~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
833
834Constants for discounts and thresholds are defined in 'UnfoldingOpts'. They are:
835
836unfoldingCreationThreshold
837     At a definition site, if the unfolding is bigger than this, we
838     may discard it altogether
839
840unfoldingUseThreshold
841     At a call site, if the unfolding, less discounts, is smaller than
842     this, then it's small enough inline
843
844unfoldingDictDiscount
845     The discount for each occurrence of a dictionary argument
846     as an argument of a class method.  Should be pretty small
847     else big functions may get inlined
848
849unfoldingFunAppDiscount
850     Discount for a function argument that is applied.  Quite
851     large, because if we inline we avoid the higher-order call.
852
853unfoldingVeryAggressive
854     If True, the compiler ignores all the thresholds and inlines very
855     aggressively. It still adheres to arity, simplifier phase control and
856     loop breakers.
857
858
859Historical Note: Before April 2020 we had another factor,
860ufKeenessFactor, which would scale the discounts before they were subtracted
861from the size. This was justified with the following comment:
862
863  -- We multiply the raw discounts (args_discount and result_discount)
864  -- ty opt_UnfoldingKeenessFactor because the former have to do with
865  --  *size* whereas the discounts imply that there's some extra
866  --  *efficiency* to be gained (e.g. beta reductions, case reductions)
867  -- by inlining.
868
869However, this is highly suspect since it means that we subtract a *scaled* size
870from an absolute size, resulting in crazy (e.g. negative) scores in some cases
871(#15304). We consequently killed off ufKeenessFactor and bumped up the
872ufUseThreshold to compensate.
873
874
875Note [Function applications]
876~~~~~~~~~~~~~~~~~~~~~~~~~~~~
877In a function application (f a b)
878
879  - If 'f' is an argument to the function being analysed,
880    and there's at least one value arg, record a FunAppDiscount for f
881
882  - If the application if a PAP (arity > 2 in this example)
883    record a *result* discount (because inlining
884    with "extra" args in the call may mean that we now
885    get a saturated application)
886
887Code for manipulating sizes
888-}
889
890-- | The size of a candidate expression for unfolding
891data ExprSize
892    = TooBig
893    | SizeIs { _es_size_is  :: {-# UNPACK #-} !Int -- ^ Size found
894             , _es_args     :: !(Bag (Id,Int))
895               -- ^ Arguments cased herein, and discount for each such
896             , _es_discount :: {-# UNPACK #-} !Int
897               -- ^ Size to subtract if result is scrutinised by a case
898               -- expression
899             }
900
901instance Outputable ExprSize where
902  ppr TooBig         = text "TooBig"
903  ppr (SizeIs a _ c) = brackets (int a <+> int c)
904
905-- subtract the discount before deciding whether to bale out. eg. we
906-- want to inline a large constructor application into a selector:
907--      tup = (a_1, ..., a_99)
908--      x = case tup of ...
909--
910mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
911mkSizeIs max n xs d | (n - d) > max = TooBig
912                    | otherwise     = SizeIs n xs d
913
914maxSize :: ExprSize -> ExprSize -> ExprSize
915maxSize TooBig         _                                  = TooBig
916maxSize _              TooBig                             = TooBig
917maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2   = s1
918                                              | otherwise = s2
919
920sizeZero :: ExprSize
921sizeN :: Int -> ExprSize
922
923sizeZero = SizeIs 0 emptyBag 0
924sizeN n  = SizeIs n emptyBag 0
925
926{-
927************************************************************************
928*                                                                      *
929\subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
930*                                                                      *
931************************************************************************
932
933We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
934we ``couldn't possibly use'' on the other side.  Can be overridden w/
935flaggery.  Just the same as smallEnoughToInline, except that it has no
936actual arguments.
937-}
938
939couldBeSmallEnoughToInline :: UnfoldingOpts -> Int -> CoreExpr -> Bool
940couldBeSmallEnoughToInline opts threshold rhs
941  = case sizeExpr opts threshold [] body of
942       TooBig -> False
943       _      -> True
944  where
945    (_, body) = collectBinders rhs
946
947----------------
948smallEnoughToInline :: UnfoldingOpts -> Unfolding -> Bool
949smallEnoughToInline opts (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}})
950  = size <= unfoldingUseThreshold opts
951smallEnoughToInline _ _
952  = False
953
954----------------
955
956certainlyWillInline :: UnfoldingOpts -> IdInfo -> Maybe Unfolding
957-- ^ Sees if the unfolding is pretty certain to inline.
958-- If so, return a *stable* unfolding for it, that will always inline.
959certainlyWillInline opts fn_info
960  = case fn_unf of
961      CoreUnfolding { uf_tmpl = expr, uf_guidance = guidance, uf_src = src }
962        | loop_breaker -> Nothing       -- Won't inline, so try w/w
963        | noinline     -> Nothing       -- See Note [Worker-wrapper for NOINLINE functions]
964        | otherwise
965        -> case guidance of
966             UnfNever  -> Nothing
967             UnfWhen {} -> Just (fn_unf { uf_src = src' })
968                             -- INLINE functions have UnfWhen
969             UnfIfGoodArgs { ug_size = size, ug_args = args }
970               -> do_cunf expr size args src'
971        where
972          src' = case src of
973                   InlineRhs -> InlineStable
974                   _         -> src  -- Do not change InlineCompulsory!
975
976      DFunUnfolding {} -> Just fn_unf  -- Don't w/w DFuns; it never makes sense
977                                       -- to do so, and even if it is currently a
978                                       -- loop breaker, it may not be later
979
980      _other_unf       -> Nothing
981
982  where
983    loop_breaker = isStrongLoopBreaker (occInfo fn_info)
984    noinline     = inlinePragmaSpec (inlinePragInfo fn_info) == NoInline
985    fn_unf       = unfoldingInfo fn_info
986
987        -- The UnfIfGoodArgs case seems important.  If we w/w small functions
988        -- binary sizes go up by 10%!  (This is with SplitObjs.)
989        -- I'm not totally sure why.
990        -- INLINABLE functions come via this path
991        --    See Note [certainlyWillInline: INLINABLE]
992    do_cunf expr size args src'
993      | arityInfo fn_info > 0  -- See Note [certainlyWillInline: be careful of thunks]
994      , not (isDeadEndSig (strictnessInfo fn_info))
995              -- Do not unconditionally inline a bottoming functions even if
996              -- it seems smallish. We've carefully lifted it out to top level,
997              -- so we don't want to re-inline it.
998      , let unf_arity = length args
999      , size - (10 * (unf_arity + 1)) <= unfoldingUseThreshold opts
1000      = Just (fn_unf { uf_src      = src'
1001                     , uf_guidance = UnfWhen { ug_arity     = unf_arity
1002                                             , ug_unsat_ok  = unSaturatedOk
1003                                             , ug_boring_ok = inlineBoringOk expr } })
1004             -- Note the "unsaturatedOk". A function like  f = \ab. a
1005             -- will certainly inline, even if partially applied (f e), so we'd
1006             -- better make sure that the transformed inlining has the same property
1007      | otherwise
1008      = Nothing
1009
1010{- Note [certainlyWillInline: be careful of thunks]
1011~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1012Don't claim that thunks will certainly inline, because that risks work
1013duplication.  Even if the work duplication is not great (eg is_cheap
1014holds), it can make a big difference in an inner loop In #5623 we
1015found that the WorkWrap phase thought that
1016       y = case x of F# v -> F# (v +# v)
1017was certainlyWillInline, so the addition got duplicated.
1018
1019Note that we check arityInfo instead of the arity of the unfolding to detect
1020this case. This is so that we don't accidentally fail to inline small partial
1021applications, like `f = g 42` (where `g` recurses into `f`) where g has arity 2
1022(say). Here there is no risk of work duplication, and the RHS is tiny, so
1023certainlyWillInline should return True. But `unf_arity` is zero! However f's
1024arity, gotten from `arityInfo fn_info`, is 1.
1025
1026Failing to say that `f` will inline forces W/W to generate a potentially huge
1027worker for f that will immediately cancel with `g`'s wrapper anyway, causing
1028unnecessary churn in the Simplifier while arriving at the same result.
1029
1030Note [certainlyWillInline: INLINABLE]
1031~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1032certainlyWillInline /must/ return Nothing for a large INLINABLE thing,
1033even though we have a stable inlining, so that strictness w/w takes
1034place.  It makes a big difference to efficiency, and the w/w pass knows
1035how to transfer the INLINABLE info to the worker; see WorkWrap
1036Note [Worker-wrapper for INLINABLE functions]
1037
1038************************************************************************
1039*                                                                      *
1040\subsection{callSiteInline}
1041*                                                                      *
1042************************************************************************
1043
1044This is the key function.  It decides whether to inline a variable at a call site
1045
1046callSiteInline is used at call sites, so it is a bit more generous.
1047It's a very important function that embodies lots of heuristics.
1048A non-WHNF can be inlined if it doesn't occur inside a lambda,
1049and occurs exactly once or
1050    occurs once in each branch of a case and is small
1051
1052If the thing is in WHNF, there's no danger of duplicating work,
1053so we can inline if it occurs once, or is small
1054
1055NOTE: we don't want to inline top-level functions that always diverge.
1056It just makes the code bigger.  Tt turns out that the convenient way to prevent
1057them inlining is to give them a NOINLINE pragma, which we do in
1058StrictAnal.addStrictnessInfoToTopId
1059-}
1060
1061callSiteInline :: Logger
1062               -> DynFlags
1063               -> Int                   -- Case depth
1064               -> Id                    -- The Id
1065               -> Bool                  -- True <=> unfolding is active
1066               -> Bool                  -- True if there are no arguments at all (incl type args)
1067               -> [ArgSummary]          -- One for each value arg; True if it is interesting
1068               -> CallCtxt              -- True <=> continuation is interesting
1069               -> Maybe CoreExpr        -- Unfolding, if any
1070
1071data ArgSummary = TrivArg       -- Nothing interesting
1072                | NonTrivArg    -- Arg has structure
1073                | ValueArg      -- Arg is a con-app or PAP
1074                                -- ..or con-like. Note [Conlike is interesting]
1075
1076instance Outputable ArgSummary where
1077  ppr TrivArg    = text "TrivArg"
1078  ppr NonTrivArg = text "NonTrivArg"
1079  ppr ValueArg   = text "ValueArg"
1080
1081nonTriv ::  ArgSummary -> Bool
1082nonTriv TrivArg = False
1083nonTriv _       = True
1084
1085data CallCtxt
1086  = BoringCtxt
1087  | RhsCtxt             -- Rhs of a let-binding; see Note [RHS of lets]
1088  | DiscArgCtxt         -- Argument of a function with non-zero arg discount
1089  | RuleArgCtxt         -- We are somewhere in the argument of a function with rules
1090
1091  | ValAppCtxt          -- We're applied to at least one value arg
1092                        -- This arises when we have ((f x |> co) y)
1093                        -- Then the (f x) has argument 'x' but in a ValAppCtxt
1094
1095  | CaseCtxt            -- We're the scrutinee of a case
1096                        -- that decomposes its scrutinee
1097
1098instance Outputable CallCtxt where
1099  ppr CaseCtxt    = text "CaseCtxt"
1100  ppr ValAppCtxt  = text "ValAppCtxt"
1101  ppr BoringCtxt  = text "BoringCtxt"
1102  ppr RhsCtxt     = text "RhsCtxt"
1103  ppr DiscArgCtxt = text "DiscArgCtxt"
1104  ppr RuleArgCtxt = text "RuleArgCtxt"
1105
1106callSiteInline logger dflags !case_depth id active_unfolding lone_variable arg_infos cont_info
1107  = case idUnfolding id of
1108      -- idUnfolding checks for loop-breakers, returning NoUnfolding
1109      -- Things with an INLINE pragma may have an unfolding *and*
1110      -- be a loop breaker  (maybe the knot is not yet untied)
1111        CoreUnfolding { uf_tmpl = unf_template
1112                      , uf_is_work_free = is_wf
1113                      , uf_guidance = guidance, uf_expandable = is_exp }
1114          | active_unfolding -> tryUnfolding logger dflags case_depth id lone_variable
1115                                    arg_infos cont_info unf_template
1116                                    is_wf is_exp guidance
1117          | otherwise -> traceInline logger dflags id "Inactive unfolding:" (ppr id) Nothing
1118        NoUnfolding      -> Nothing
1119        BootUnfolding    -> Nothing
1120        OtherCon {}      -> Nothing
1121        DFunUnfolding {} -> Nothing     -- Never unfold a DFun
1122
1123-- | Report the inlining of an identifier's RHS to the user, if requested.
1124traceInline :: Logger -> DynFlags -> Id -> String -> SDoc -> a -> a
1125traceInline logger dflags inline_id str doc result
1126  -- We take care to ensure that doc is used in only one branch, ensuring that
1127  -- the simplifier can push its allocation into the branch. See Note [INLINE
1128  -- conditional tracing utilities].
1129  | enable    = putTraceMsg logger dflags str doc result
1130  | otherwise = result
1131  where
1132    enable
1133      | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags
1134      = True
1135      | Just prefix <- inlineCheck dflags
1136      = prefix `isPrefixOf` occNameString (getOccName inline_id)
1137      | otherwise
1138      = False
1139{-# INLINE traceInline #-} -- see Note [INLINE conditional tracing utilities]
1140
1141{- Note [Avoid inlining into deeply nested cases]
1142   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1143
1144Consider a function f like this:
1145
1146  f arg1 arg2 =
1147    case ...
1148      ... -> g arg1
1149      ... -> g arg2
1150
1151This function is small. So should be safe to inline.
1152However sometimes this doesn't quite work out like that.
1153Consider this code:
1154
1155f1 arg1 arg2 ... = ...
1156    case _foo of
1157      alt1 -> ... f2 arg1 ...
1158      alt2 -> ... f2 arg2 ...
1159
1160f2 arg1 arg2 ... = ...
1161    case _foo of
1162      alt1 -> ... f3 arg1 ...
1163      alt2 -> ... f3 arg2 ...
1164
1165f3 arg1 arg2 ... = ...
1166
1167... repeats up to n times. And then f1 is
1168applied to some arguments:
1169
1170foo = ... f1 <interestingArgs> ...
1171
1172Initially f2..fn are not interesting to inline so we don't.
1173However we see that f1 is applied to interesting args.
1174So it's an obvious choice to inline those:
1175
1176foo =
1177    ...
1178      case _foo of
1179        alt1 -> ... f2 <interestingArg> ...
1180        alt2 -> ... f2 <interestingArg> ...
1181
1182As a result we go and inline f2 both mentions of f2 in turn are now applied to interesting
1183arguments and f2 is small:
1184
1185foo =
1186    ...
1187      case _foo of
1188        alt1 -> ... case _foo of
1189            alt1 -> ... f3 <interestingArg> ...
1190            alt2 -> ... f3 <interestingArg> ...
1191
1192        alt2 -> ... case _foo of
1193            alt1 -> ... f3 <interestingArg> ...
1194            alt2 -> ... f3 <interestingArg> ...
1195
1196The same thing happens for each binding up to f_n, duplicating the amount of inlining
1197done in each step. Until at some point we are either done or run out of simplifier
1198ticks/RAM. This pattern happened #18730.
1199
1200To combat this we introduce one more heuristic when weighing inlining decision.
1201We keep track of a "case-depth". Which increases each time we look inside a case
1202expression with more than one alternative.
1203
1204We then apply a penalty to inlinings based on the case-depth at which they would
1205be inlined. Bounding the number of inlinings in such a scenario.
1206
1207The heuristic can be tuned in two ways:
1208
1209* We can ignore the first n levels of case nestings for inlining decisions using
1210  -funfolding-case-threshold.
1211* The penalty grows linear with the depth. It's computed as size*(depth-threshold)/scaling.
1212  Scaling can be set with -funfolding-case-scaling.
1213
1214Some guidance on setting these defaults:
1215
1216* A low treshold (<= 2) is needed to prevent exponential cases from spiraling out of
1217  control. We picked 2 for no particular reason.
1218* Scaling the penalty by any more than 30 means the reproducer from
1219  T18730 won't compile even with reasonably small values of n. Instead
1220  it will run out of runs/ticks. This means to positively affect the reproducer
1221  a scaling <= 30 is required.
1222* A scaling of >= 15 still causes a few very large regressions on some nofib benchmarks.
1223  (+80% for gc/fulsom, +90% for real/ben-raytrace, +20% for spectral/fibheaps)
1224* A scaling of >= 25 showed no regressions on nofib. However it showed a number of
1225  (small) regression for compiler perf benchmarks.
1226
1227The end result is that we are settling for a scaling of 30, with a threshold of 2.
1228This gives us minimal compiler perf regressions. No nofib runtime regressions and
1229will still avoid this pattern sometimes. This is a "safe" default, where we err on
1230the side of compiler blowup instead of risking runtime regressions.
1231
1232For cases where the default falls short the flag can be changed to allow more/less inlining as
1233needed on a per-module basis.
1234
1235-}
1236
1237tryUnfolding :: Logger -> DynFlags -> Int -> Id -> Bool -> [ArgSummary] -> CallCtxt
1238             -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance
1239             -> Maybe CoreExpr
1240tryUnfolding logger dflags !case_depth id lone_variable
1241             arg_infos cont_info unf_template
1242             is_wf is_exp guidance
1243 = case guidance of
1244     UnfNever -> traceInline logger dflags id str (text "UnfNever") Nothing
1245
1246     UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
1247        | enough_args && (boring_ok || some_benefit || unfoldingVeryAggressive uf_opts)
1248                -- See Note [INLINE for small functions (3)]
1249        -> traceInline logger dflags id str (mk_doc some_benefit empty True) (Just unf_template)
1250        | otherwise
1251        -> traceInline logger dflags id str (mk_doc some_benefit empty False) Nothing
1252        where
1253          some_benefit = calc_some_benefit uf_arity
1254          enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
1255
1256     UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
1257        | unfoldingVeryAggressive uf_opts
1258        -> traceInline logger dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1259        | is_wf && some_benefit && small_enough
1260        -> traceInline logger dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1261        | otherwise
1262        -> traceInline logger dflags id str (mk_doc some_benefit extra_doc False) Nothing
1263        where
1264          some_benefit = calc_some_benefit (length arg_discounts)
1265          extra_doc = vcat [ text "case depth =" <+> int case_depth
1266                           , text "depth based penalty =" <+> int depth_penalty
1267                           , text "discounted size =" <+> int adjusted_size ]
1268          -- See Note [Avoid inlining into deeply nested cases]
1269          depth_treshold = unfoldingCaseThreshold uf_opts
1270          depth_scaling = unfoldingCaseScaling uf_opts
1271          depth_penalty | case_depth <= depth_treshold = 0
1272                        | otherwise       = (size * (case_depth - depth_treshold)) `div` depth_scaling
1273          adjusted_size = size + depth_penalty - discount
1274          small_enough = adjusted_size <= unfoldingUseThreshold uf_opts
1275          discount = computeDiscount arg_discounts res_discount arg_infos cont_info
1276
1277  where
1278    uf_opts = unfoldingOpts dflags
1279    mk_doc some_benefit extra_doc yes_or_no
1280      = vcat [ text "arg infos" <+> ppr arg_infos
1281             , text "interesting continuation" <+> ppr cont_info
1282             , text "some_benefit" <+> ppr some_benefit
1283             , text "is exp:" <+> ppr is_exp
1284             , text "is work-free:" <+> ppr is_wf
1285             , text "guidance" <+> ppr guidance
1286             , extra_doc
1287             , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
1288
1289    ctx = initSDocContext dflags defaultDumpStyle
1290    str = "Considering inlining: " ++ showSDocDump ctx (ppr id)
1291    n_val_args = length arg_infos
1292
1293           -- some_benefit is used when the RHS is small enough
1294           -- and the call has enough (or too many) value
1295           -- arguments (ie n_val_args >= arity). But there must
1296           -- be *something* interesting about some argument, or the
1297           -- result context, to make it worth inlining
1298    calc_some_benefit :: Arity -> Bool   -- The Arity is the number of args
1299                                         -- expected by the unfolding
1300    calc_some_benefit uf_arity
1301       | not saturated = interesting_args       -- Under-saturated
1302                                        -- Note [Unsaturated applications]
1303       | otherwise = interesting_args   -- Saturated or over-saturated
1304                  || interesting_call
1305      where
1306        saturated      = n_val_args >= uf_arity
1307        over_saturated = n_val_args > uf_arity
1308        interesting_args = any nonTriv arg_infos
1309                -- NB: (any nonTriv arg_infos) looks at the
1310                -- over-saturated args too which is "wrong";
1311                -- but if over-saturated we inline anyway.
1312
1313        interesting_call
1314          | over_saturated
1315          = True
1316          | otherwise
1317          = case cont_info of
1318              CaseCtxt   -> not (lone_variable && is_exp)  -- Note [Lone variables]
1319              ValAppCtxt -> True                           -- Note [Cast then apply]
1320              RuleArgCtxt -> uf_arity > 0  -- See Note [Unfold info lazy contexts]
1321              DiscArgCtxt -> uf_arity > 0  -- Note [Inlining in ArgCtxt]
1322              RhsCtxt     -> uf_arity > 0  --
1323              _other      -> False         -- See Note [Nested functions]
1324
1325
1326{-
1327Note [Unfold into lazy contexts], Note [RHS of lets]
1328~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1329When the call is the argument of a function with a RULE, or the RHS of a let,
1330we are a little bit keener to inline.  For example
1331     f y = (y,y,y)
1332     g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
1333We'd inline 'f' if the call was in a case context, and it kind-of-is,
1334only we can't see it.  Also
1335     x = f v
1336could be expensive whereas
1337     x = case v of (a,b) -> a
1338is patently cheap and may allow more eta expansion.
1339So we treat the RHS of a let as not-totally-boring.
1340
1341Note [Unsaturated applications]
1342~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1343When a call is not saturated, we *still* inline if one of the
1344arguments has interesting structure.  That's sometimes very important.
1345A good example is the Ord instance for Bool in Base:
1346
1347 Rec {
1348    $fOrdBool =GHC.Classes.D:Ord
1349                 @ Bool
1350                 ...
1351                 $cmin_ajX
1352
1353    $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
1354    $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
1355  }
1356
1357But the defn of GHC.Classes.$dmmin is:
1358
1359  $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
1360    {- Arity: 3, HasNoCafRefs, Strictness: SLL,
1361       Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
1362                   case @ a GHC.Classes.<= @ a $dOrd x y of wild {
1363                     GHC.Types.False -> y GHC.Types.True -> x }) -}
1364
1365We *really* want to inline $dmmin, even though it has arity 3, in
1366order to unravel the recursion.
1367
1368
1369Note [Things to watch]
1370~~~~~~~~~~~~~~~~~~~~~~
1371*   { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
1372    Assume x is exported, so not inlined unconditionally.
1373    Then we want x to inline unconditionally; no reason for it
1374    not to, and doing so avoids an indirection.
1375
1376*   { x = I# 3; ....f x.... }
1377    Make sure that x does not inline unconditionally!
1378    Lest we get extra allocation.
1379
1380Note [Inlining an InlineRule]
1381~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1382An InlineRules is used for
1383  (a) programmer INLINE pragmas
1384  (b) inlinings from worker/wrapper
1385
1386For (a) the RHS may be large, and our contract is that we *only* inline
1387when the function is applied to all the arguments on the LHS of the
1388source-code defn.  (The uf_arity in the rule.)
1389
1390However for worker/wrapper it may be worth inlining even if the
1391arity is not satisfied (as we do in the CoreUnfolding case) so we don't
1392require saturation.
1393
1394Note [Nested functions]
1395~~~~~~~~~~~~~~~~~~~~~~~
1396At one time we treated a call of a non-top-level function as
1397"interesting" (regardless of how boring the context) in the hope
1398that inlining it would eliminate the binding, and its allocation.
1399Specifically, in the default case of interesting_call we had
1400   _other -> not is_top && uf_arity > 0
1401
1402But actually postInlineUnconditionally does some of this and overall
1403it makes virtually no difference to nofib.  So I simplified away this
1404special case
1405
1406Note [Cast then apply]
1407~~~~~~~~~~~~~~~~~~~~~~
1408Consider
1409   myIndex = __inline_me ( (/\a. <blah>) |> co )
1410   co :: (forall a. a -> a) ~ (forall a. T a)
1411     ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
1412
1413We need to inline myIndex to unravel this; but the actual call (myIndex a) has
1414no value arguments.  The ValAppCtxt gives it enough incentive to inline.
1415
1416Note [Inlining in ArgCtxt]
1417~~~~~~~~~~~~~~~~~~~~~~~~~~
1418The condition (arity > 0) here is very important, because otherwise
1419we end up inlining top-level stuff into useless places; eg
1420   x = I# 3#
1421   f = \y.  g x
1422This can make a very big difference: it adds 16% to nofib 'integer' allocs,
1423and 20% to 'power'.
1424
1425At one stage I replaced this condition by 'True' (leading to the above
1426slow-down).  The motivation was test eyeball/inline1.hs; but that seems
1427to work ok now.
1428
1429NOTE: arguably, we should inline in ArgCtxt only if the result of the
1430call is at least CONLIKE.  At least for the cases where we use ArgCtxt
1431for the RHS of a 'let', we only profit from the inlining if we get a
1432CONLIKE thing (modulo lets).
1433
1434Note [Lone variables]   See also Note [Interaction of exprIsWorkFree and lone variables]
1435~~~~~~~~~~~~~~~~~~~~~   which appears below
1436The "lone-variable" case is important.  I spent ages messing about
1437with unsatisfactory variants, but this is nice.  The idea is that if a
1438variable appears all alone
1439
1440        as an arg of lazy fn, or rhs    BoringCtxt
1441        as scrutinee of a case          CaseCtxt
1442        as arg of a fn                  ArgCtxt
1443AND
1444        it is bound to a cheap expression
1445
1446then we should not inline it (unless there is some other reason,
1447e.g. it is the sole occurrence).  That is what is happening at
1448the use of 'lone_variable' in 'interesting_call'.
1449
1450Why?  At least in the case-scrutinee situation, turning
1451        let x = (a,b) in case x of y -> ...
1452into
1453        let x = (a,b) in case (a,b) of y -> ...
1454and thence to
1455        let x = (a,b) in let y = (a,b) in ...
1456is bad if the binding for x will remain.
1457
1458Another example: I discovered that strings
1459were getting inlined straight back into applications of 'error'
1460because the latter is strict.
1461        s = "foo"
1462        f = \x -> ...(error s)...
1463
1464Fundamentally such contexts should not encourage inlining because, provided
1465the RHS is "expandable" (see Note [exprIsExpandable] in GHC.Core.Utils) the
1466context can ``see'' the unfolding of the variable (e.g. case or a
1467RULE) so there's no gain.
1468
1469However, watch out:
1470
1471 * Consider this:
1472        foo = _inline_ (\n. [n])
1473        bar = _inline_ (foo 20)
1474        baz = \n. case bar of { (m:_) -> m + n }
1475   Here we really want to inline 'bar' so that we can inline 'foo'
1476   and the whole thing unravels as it should obviously do.  This is
1477   important: in the NDP project, 'bar' generates a closure data
1478   structure rather than a list.
1479
1480   So the non-inlining of lone_variables should only apply if the
1481   unfolding is regarded as cheap; because that is when exprIsConApp_maybe
1482   looks through the unfolding.  Hence the "&& is_wf" in the
1483   InlineRule branch.
1484
1485 * Even a type application or coercion isn't a lone variable.
1486   Consider
1487        case $fMonadST @ RealWorld of { :DMonad a b c -> c }
1488   We had better inline that sucker!  The case won't see through it.
1489
1490   For now, I'm treating treating a variable applied to types
1491   in a *lazy* context "lone". The motivating example was
1492        f = /\a. \x. BIG
1493        g = /\a. \y.  h (f a)
1494   There's no advantage in inlining f here, and perhaps
1495   a significant disadvantage.  Hence some_val_args in the Stop case
1496
1497Note [Interaction of exprIsWorkFree and lone variables]
1498~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1499The lone-variable test says "don't inline if a case expression
1500scrutinises a lone variable whose unfolding is cheap".  It's very
1501important that, under these circumstances, exprIsConApp_maybe
1502can spot a constructor application. So, for example, we don't
1503consider
1504        let x = e in (x,x)
1505to be cheap, and that's good because exprIsConApp_maybe doesn't
1506think that expression is a constructor application.
1507
1508In the 'not (lone_variable && is_wf)' test, I used to test is_value
1509rather than is_wf, which was utterly wrong, because the above
1510expression responds True to exprIsHNF, which is what sets is_value.
1511
1512This kind of thing can occur if you have
1513
1514        {-# INLINE foo #-}
1515        foo = let x = e in (x,x)
1516
1517which Roman did.
1518
1519
1520-}
1521
1522computeDiscount :: [Int] -> Int -> [ArgSummary] -> CallCtxt
1523                -> Int
1524computeDiscount arg_discounts res_discount arg_infos cont_info
1525
1526  = 10          -- Discount of 10 because the result replaces the call
1527                -- so we count 10 for the function itself
1528
1529    + 10 * length actual_arg_discounts
1530               -- Discount of 10 for each arg supplied,
1531               -- because the result replaces the call
1532
1533    + total_arg_discount + res_discount'
1534  where
1535    actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
1536    total_arg_discount   = sum actual_arg_discounts
1537
1538    mk_arg_discount _        TrivArg    = 0
1539    mk_arg_discount _        NonTrivArg = 10
1540    mk_arg_discount discount ValueArg   = discount
1541
1542    res_discount'
1543      | LT <- arg_discounts `compareLength` arg_infos
1544      = res_discount   -- Over-saturated
1545      | otherwise
1546      = case cont_info of
1547           BoringCtxt  -> 0
1548           CaseCtxt    -> res_discount  -- Presumably a constructor
1549           ValAppCtxt  -> res_discount  -- Presumably a function
1550           _           -> 40 `min` res_discount
1551                -- ToDo: this 40 `min` res_discount doesn't seem right
1552                --   for DiscArgCtxt it shouldn't matter because the function will
1553                --       get the arg discount for any non-triv arg
1554                --   for RuleArgCtxt we do want to be keener to inline; but not only
1555                --       constructor results
1556                --   for RhsCtxt I suppose that exposing a data con is good in general
1557                --   And 40 seems very arbitrary
1558                --
1559                -- res_discount can be very large when a function returns
1560                -- constructors; but we only want to invoke that large discount
1561                -- when there's a case continuation.
1562                -- Otherwise we, rather arbitrarily, threshold it.  Yuk.
1563                -- But we want to avoid inlining large functions that return
1564                -- constructors into contexts that are simply "interesting"
1565