1{-
2(c) The University of Glasgow 2006
3(c) The AQUA Project, Glasgow University, 1994-1998
4
5
6Core-syntax unfoldings
7
8Unfoldings (which can travel across module boundaries) are in Core
9syntax (namely @CoreExpr@s).
10
11The type @Unfolding@ sits ``above'' simply-Core-expressions
12unfoldings, capturing ``higher-level'' things we know about a binding,
13usually things that the simplifier found out (e.g., ``it's a
14literal'').  In the corner of a @CoreUnfolding@ unfolding, you will
15find, unsurprisingly, a Core expression.
16-}
17
18{-# LANGUAGE CPP #-}
19
20{-# OPTIONS_GHC -Wno-incomplete-record-updates #-}
21
22module GHC.Core.Unfold (
23        Unfolding, UnfoldingGuidance,   -- Abstract types
24
25        noUnfolding,
26        mkUnfolding, mkCoreUnfolding,
27        mkFinalUnfolding, mkSimpleUnfolding, mkWorkerUnfolding,
28        mkInlineUnfolding, mkInlineUnfoldingWithArity,
29        mkInlinableUnfolding, mkWwInlineRule,
30        mkCompulsoryUnfolding, mkDFunUnfolding,
31        specUnfolding,
32
33        ArgSummary(..),
34
35        couldBeSmallEnoughToInline, inlineBoringOk,
36        certainlyWillInline, smallEnoughToInline,
37
38        callSiteInline, CallCtxt(..),
39
40        -- Reexport from GHC.Core.Subst (it only live there so it can be used
41        -- by the Very Simple Optimiser)
42        exprIsConApp_maybe, exprIsLiteral_maybe
43    ) where
44
45#include "GhclibHsVersions.h"
46
47import GHC.Prelude
48
49import GHC.Driver.Session
50import GHC.Core
51import GHC.Core.Opt.OccurAnal ( occurAnalyseExpr )
52import GHC.Core.SimpleOpt
53import GHC.Core.Opt.Arity   ( manifestArity )
54import GHC.Core.Utils
55import GHC.Types.Id
56import GHC.Types.Demand ( StrictSig, isDeadEndSig )
57import GHC.Core.DataCon
58import GHC.Types.Literal
59import GHC.Builtin.PrimOps
60import GHC.Types.Id.Info
61import GHC.Types.Basic  ( Arity, InlineSpec(..), inlinePragmaSpec )
62import GHC.Core.Type
63import GHC.Builtin.Names
64import GHC.Builtin.Types.Prim ( realWorldStatePrimTy )
65import GHC.Data.Bag
66import GHC.Utils.Misc
67import GHC.Utils.Outputable
68import GHC.Types.ForeignCall
69import GHC.Types.Name
70import GHC.Utils.Error
71
72import qualified Data.ByteString as BS
73import Data.List
74
75{-
76************************************************************************
77*                                                                      *
78\subsection{Making unfoldings}
79*                                                                      *
80************************************************************************
81-}
82
83mkFinalUnfolding :: DynFlags -> UnfoldingSource -> StrictSig -> CoreExpr -> Unfolding
84-- "Final" in the sense that this is a GlobalId that will not be further
85-- simplified; so the unfolding should be occurrence-analysed
86mkFinalUnfolding dflags src strict_sig expr
87  = mkUnfolding dflags src
88                True {- Top level -}
89                (isDeadEndSig strict_sig)
90                expr
91
92mkCompulsoryUnfolding :: CoreExpr -> Unfolding
93mkCompulsoryUnfolding expr         -- Used for things that absolutely must be unfolded
94  = mkCoreUnfolding InlineCompulsory True
95                    (simpleOptExpr unsafeGlobalDynFlags expr)
96                    (UnfWhen { ug_arity = 0    -- Arity of unfolding doesn't matter
97                             , ug_unsat_ok = unSaturatedOk, ug_boring_ok = boringCxtOk })
98
99
100-- Note [Top-level flag on inline rules]
101-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
102-- Slight hack: note that mk_inline_rules conservatively sets the
103-- top-level flag to True.  It gets set more accurately by the simplifier
104-- Simplify.simplUnfolding.
105
106mkSimpleUnfolding :: DynFlags -> CoreExpr -> Unfolding
107mkSimpleUnfolding dflags rhs
108  = mkUnfolding dflags InlineRhs False False rhs
109
110mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding
111mkDFunUnfolding bndrs con ops
112  = DFunUnfolding { df_bndrs = bndrs
113                  , df_con = con
114                  , df_args = map occurAnalyseExpr ops }
115                  -- See Note [Occurrence analysis of unfoldings]
116
117mkWwInlineRule :: DynFlags -> CoreExpr -> Arity -> Unfolding
118mkWwInlineRule dflags expr arity
119  = mkCoreUnfolding InlineStable True
120                   (simpleOptExpr dflags expr)
121                   (UnfWhen { ug_arity = arity, ug_unsat_ok = unSaturatedOk
122                            , ug_boring_ok = boringCxtNotOk })
123
124mkWorkerUnfolding :: DynFlags -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding
125-- See Note [Worker-wrapper for INLINABLE functions] in GHC.Core.Opt.WorkWrap
126mkWorkerUnfolding dflags work_fn
127                  (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
128                                 , uf_is_top = top_lvl })
129  | isStableSource src
130  = mkCoreUnfolding src top_lvl new_tmpl guidance
131  where
132    new_tmpl = simpleOptExpr dflags (work_fn tmpl)
133    guidance = calcUnfoldingGuidance dflags False new_tmpl
134
135mkWorkerUnfolding _ _ _ = noUnfolding
136
137-- | Make an unfolding that may be used unsaturated
138-- (ug_unsat_ok = unSaturatedOk) and that is reported as having its
139-- manifest arity (the number of outer lambdas applications will
140-- resolve before doing any work).
141mkInlineUnfolding :: CoreExpr -> Unfolding
142mkInlineUnfolding expr
143  = mkCoreUnfolding InlineStable
144                    True         -- Note [Top-level flag on inline rules]
145                    expr' guide
146  where
147    expr' = simpleOptExpr unsafeGlobalDynFlags expr
148    guide = UnfWhen { ug_arity = manifestArity expr'
149                    , ug_unsat_ok = unSaturatedOk
150                    , ug_boring_ok = boring_ok }
151    boring_ok = inlineBoringOk expr'
152
153-- | Make an unfolding that will be used once the RHS has been saturated
154-- to the given arity.
155mkInlineUnfoldingWithArity :: Arity -> CoreExpr -> Unfolding
156mkInlineUnfoldingWithArity arity expr
157  = mkCoreUnfolding InlineStable
158                    True         -- Note [Top-level flag on inline rules]
159                    expr' guide
160  where
161    expr' = simpleOptExpr unsafeGlobalDynFlags expr
162    guide = UnfWhen { ug_arity = arity
163                    , ug_unsat_ok = needSaturated
164                    , ug_boring_ok = boring_ok }
165    -- See Note [INLINE pragmas and boring contexts] as to why we need to look
166    -- at the arity here.
167    boring_ok | arity == 0 = True
168              | otherwise  = inlineBoringOk expr'
169
170mkInlinableUnfolding :: DynFlags -> CoreExpr -> Unfolding
171mkInlinableUnfolding dflags expr
172  = mkUnfolding dflags InlineStable False False expr'
173  where
174    expr' = simpleOptExpr dflags expr
175
176specUnfolding :: DynFlags
177              -> [Var] -> (CoreExpr -> CoreExpr)
178              -> [CoreArg]   -- LHS arguments in the RULE
179              -> Unfolding -> Unfolding
180-- See Note [Specialising unfoldings]
181-- specUnfolding spec_bndrs spec_args unf
182--   = \spec_bndrs. unf spec_args
183--
184specUnfolding dflags spec_bndrs spec_app rule_lhs_args
185              df@(DFunUnfolding { df_bndrs = old_bndrs, df_con = con, df_args = args })
186  = ASSERT2( rule_lhs_args `equalLength` old_bndrs
187           , ppr df $$ ppr rule_lhs_args )
188           -- For this ASSERT see Note [DFunUnfoldings] in GHC.Core.Opt.Specialise
189    mkDFunUnfolding spec_bndrs con (map spec_arg args)
190      -- For DFunUnfoldings we transform
191      --       \obs. MkD <op1> ... <opn>
192      -- to
193      --       \sbs. MkD ((\obs. <op1>) spec_args) ... ditto <opn>
194  where
195    spec_arg arg = simpleOptExpr dflags $
196                   spec_app (mkLams old_bndrs arg)
197                   -- The beta-redexes created by spec_app will be
198                   -- simplified away by simplOptExpr
199
200specUnfolding dflags spec_bndrs spec_app rule_lhs_args
201              (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
202                             , uf_is_top = top_lvl
203                             , uf_guidance = old_guidance })
204 | isStableSource src  -- See Note [Specialising unfoldings]
205 , UnfWhen { ug_arity     = old_arity } <- old_guidance
206 = mkCoreUnfolding src top_lvl new_tmpl
207                   (old_guidance { ug_arity = old_arity - arity_decrease })
208 where
209   new_tmpl = simpleOptExpr dflags $
210              mkLams spec_bndrs    $
211              spec_app tmpl  -- The beta-redexes created by spec_app
212                             -- will besimplified away by simplOptExpr
213   arity_decrease = count isValArg rule_lhs_args - count isId spec_bndrs
214
215
216specUnfolding _ _ _ _ _ = noUnfolding
217
218{- Note [Specialising unfoldings]
219~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
220When we specialise a function for some given type-class arguments, we use
221specUnfolding to specialise its unfolding.  Some important points:
222
223* If the original function has a DFunUnfolding, the specialised one
224  must do so too!  Otherwise we lose the magic rules that make it
225  interact with ClassOps
226
227* There is a bit of hack for INLINABLE functions:
228     f :: Ord a => ....
229     f = <big-rhs>
230     {- INLINABLE f #-}
231  Now if we specialise f, should the specialised version still have
232  an INLINABLE pragma?  If it does, we'll capture a specialised copy
233  of <big-rhs> as its unfolding, and that probably won't inline.  But
234  if we don't, the specialised version of <big-rhs> might be small
235  enough to inline at a call site. This happens with Control.Monad.liftM3,
236  and can cause a lot more allocation as a result (nofib n-body shows this).
237
238  Moreover, keeping the INLINABLE thing isn't much help, because
239  the specialised function (probably) isn't overloaded any more.
240
241  Conclusion: drop the INLINEALE pragma.  In practice what this means is:
242     if a stable unfolding has UnfoldingGuidance of UnfWhen,
243        we keep it (so the specialised thing too will always inline)
244     if a stable unfolding has UnfoldingGuidance of UnfIfGoodArgs
245        (which arises from INLINABLE), we discard it
246
247Note [Honour INLINE on 0-ary bindings]
248~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
249Consider
250
251   x = <expensive>
252   {-# INLINE x #-}
253
254   f y = ...x...
255
256The semantics of an INLINE pragma is
257
258  inline x at every call site, provided it is saturated;
259  that is, applied to at least as many arguments as appear
260  on the LHS of the Haskell source definition.
261
262(This source-code-derived arity is stored in the `ug_arity` field of
263the `UnfoldingGuidance`.)
264
265In the example, x's ug_arity is 0, so we should inline it at every use
266site.  It's rare to have such an INLINE pragma (usually INLINE Is on
267functions), but it's occasionally very important (#15578, #15519).
268In #15519 we had something like
269   x = case (g a b) of I# r -> T r
270   {-# INLINE x #-}
271   f y = ...(h x)....
272
273where h is strict.  So we got
274   f y = ...(case g a b of I# r -> h (T r))...
275
276and that in turn allowed SpecConstr to ramp up performance.
277
278How do we deliver on this?  By adjusting the ug_boring_ok
279flag in mkInlineUnfoldingWithArity; see
280Note [INLINE pragmas and boring contexts]
281
282NB: there is a real risk that full laziness will float it right back
283out again. Consider again
284  x = factorial 200
285  {-# INLINE x #-}
286  f y = ...x...
287
288After inlining we get
289  f y = ...(factorial 200)...
290
291but it's entirely possible that full laziness will do
292  lvl23 = factorial 200
293  f y = ...lvl23...
294
295That's a problem for another day.
296
297Note [INLINE pragmas and boring contexts]
298~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
299An INLINE pragma uses mkInlineUnfoldingWithArity to build the
300unfolding.  That sets the ug_boring_ok flag to False if the function
301is not tiny (inlineBoringOK), so that even INLINE functions are not
302inlined in an utterly boring context.  E.g.
303     \x y. Just (f y x)
304Nothing is gained by inlining f here, even if it has an INLINE
305pragma.
306
307But for 0-ary bindings, we want to inline regardless; see
308Note [Honour INLINE on 0-ary bindings].
309
310I'm a bit worried that it's possible for the same kind of problem
311to arise for non-0-ary functions too, but let's wait and see.
312-}
313
314mkUnfolding :: DynFlags -> UnfoldingSource
315            -> Bool       -- Is top-level
316            -> Bool       -- Definitely a bottoming binding
317                          -- (only relevant for top-level bindings)
318            -> CoreExpr
319            -> Unfolding
320-- Calculates unfolding guidance
321-- Occurrence-analyses the expression before capturing it
322mkUnfolding dflags src top_lvl is_bottoming expr
323  = mkCoreUnfolding src top_lvl expr guidance
324  where
325    is_top_bottoming = top_lvl && is_bottoming
326    guidance         = calcUnfoldingGuidance dflags is_top_bottoming expr
327        -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr expr))!
328        -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression]
329
330mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
331                -> UnfoldingGuidance -> Unfolding
332-- Occurrence-analyses the expression before capturing it
333mkCoreUnfolding src top_lvl expr guidance
334  = CoreUnfolding { uf_tmpl         = occurAnalyseExpr expr,
335                      -- See Note [Occurrence analysis of unfoldings]
336                    uf_src          = src,
337                    uf_is_top       = top_lvl,
338                    uf_is_value     = exprIsHNF        expr,
339                    uf_is_conlike   = exprIsConLike    expr,
340                    uf_is_work_free = exprIsWorkFree   expr,
341                    uf_expandable   = exprIsExpandable expr,
342                    uf_guidance     = guidance }
343
344
345{-
346Note [Occurrence analysis of unfoldings]
347~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
348We do occurrence-analysis of unfoldings once and for all, when the
349unfolding is built, rather than each time we inline them.
350
351But given this decision it's vital that we do
352*always* do it.  Consider this unfolding
353    \x -> letrec { f = ...g...; g* = f } in body
354where g* is (for some strange reason) the loop breaker.  If we don't
355occ-anal it when reading it in, we won't mark g as a loop breaker, and
356we may inline g entirely in body, dropping its binding, and leaving
357the occurrence in f out of scope. This happened in #8892, where
358the unfolding in question was a DFun unfolding.
359
360But more generally, the simplifier is designed on the
361basis that it is looking at occurrence-analysed expressions, so better
362ensure that they actually are.
363
364Note [Calculate unfolding guidance on the non-occ-anal'd expression]
365~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
366Notice that we give the non-occur-analysed expression to
367calcUnfoldingGuidance.  In some ways it'd be better to occur-analyse
368first; for example, sometimes during simplification, there's a large
369let-bound thing which has been substituted, and so is now dead; so
370'expr' contains two copies of the thing while the occurrence-analysed
371expression doesn't.
372
373Nevertheless, we *don't* and *must not* occ-analyse before computing
374the size because
375
376a) The size computation bales out after a while, whereas occurrence
377   analysis does not.
378
379b) Residency increases sharply if you occ-anal first.  I'm not
380   100% sure why, but it's a large effect.  Compiling Cabal went
381   from residency of 534M to over 800M with this one change.
382
383This can occasionally mean that the guidance is very pessimistic;
384it gets fixed up next round.  And it should be rare, because large
385let-bound things that are dead are usually caught by preInlineUnconditionally
386
387
388************************************************************************
389*                                                                      *
390\subsection{The UnfoldingGuidance type}
391*                                                                      *
392************************************************************************
393-}
394
395inlineBoringOk :: CoreExpr -> Bool
396-- See Note [INLINE for small functions]
397-- True => the result of inlining the expression is
398--         no bigger than the expression itself
399--     eg      (\x y -> f y x)
400-- This is a quick and dirty version. It doesn't attempt
401-- to deal with  (\x y z -> x (y z))
402-- The really important one is (x `cast` c)
403inlineBoringOk e
404  = go 0 e
405  where
406    go :: Int -> CoreExpr -> Bool
407    go credit (Lam x e) | isId x           = go (credit+1) e
408                        | otherwise        = go credit e
409        -- See Note [Count coercion arguments in boring contexts]
410    go credit (App f (Type {}))            = go credit f
411    go credit (App f a) | credit > 0
412                        , exprIsTrivial a  = go (credit-1) f
413    go credit (Tick _ e)                   = go credit e -- dubious
414    go credit (Cast e _)                   = go credit e
415    go credit (Case scrut _ _ [(_,_,rhs)]) -- See Note [Inline unsafeCoerce]
416      | isUnsafeEqualityProof scrut        = go credit rhs
417    go _      (Var {})                     = boringCxtOk
418    go _      _                            = boringCxtNotOk
419
420calcUnfoldingGuidance
421        :: DynFlags
422        -> Bool          -- Definitely a top-level, bottoming binding
423        -> CoreExpr      -- Expression to look at
424        -> UnfoldingGuidance
425calcUnfoldingGuidance dflags is_top_bottoming (Tick t expr)
426  | not (tickishIsCode t)  -- non-code ticks don't matter for unfolding
427  = calcUnfoldingGuidance dflags is_top_bottoming expr
428calcUnfoldingGuidance dflags is_top_bottoming expr
429  = case sizeExpr dflags bOMB_OUT_SIZE val_bndrs body of
430      TooBig -> UnfNever
431      SizeIs size cased_bndrs scrut_discount
432        | uncondInline expr n_val_bndrs size
433        -> UnfWhen { ug_unsat_ok = unSaturatedOk
434                   , ug_boring_ok =  boringCxtOk
435                   , ug_arity = n_val_bndrs }   -- Note [INLINE for small functions]
436
437        | is_top_bottoming
438        -> UnfNever   -- See Note [Do not inline top-level bottoming functions]
439
440        | otherwise
441        -> UnfIfGoodArgs { ug_args  = map (mk_discount cased_bndrs) val_bndrs
442                         , ug_size  = size
443                         , ug_res   = scrut_discount }
444
445  where
446    (bndrs, body) = collectBinders expr
447    bOMB_OUT_SIZE = ufCreationThreshold dflags
448           -- Bomb out if size gets bigger than this
449    val_bndrs   = filter isId bndrs
450    n_val_bndrs = length val_bndrs
451
452    mk_discount :: Bag (Id,Int) -> Id -> Int
453    mk_discount cbs bndr = foldl' combine 0 cbs
454           where
455             combine acc (bndr', disc)
456               | bndr == bndr' = acc `plus_disc` disc
457               | otherwise     = acc
458
459             plus_disc :: Int -> Int -> Int
460             plus_disc | isFunTy (idType bndr) = max
461                       | otherwise             = (+)
462             -- See Note [Function and non-function discounts]
463
464{- Note [Inline unsafeCoerce]
465~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
466We really want to inline unsafeCoerce, even when applied to boring
467arguments.  It doesn't look as if its RHS is smaller than the call
468   unsafeCoerce x = case unsafeEqualityProof @a @b of UnsafeRefl -> x
469but that case is discarded -- see Note [Implementing unsafeCoerce]
470in base:Unsafe.Coerce.
471
472Moreover, if we /don't/ inline it, we may be left with
473          f (unsafeCoerce x)
474which will build a thunk -- bad, bad, bad.
475
476Conclusion: we really want inlineBoringOk to be True of the RHS of
477unsafeCoerce.  This is (U4a) in Note [Implementing unsafeCoerce].
478
479Note [Computing the size of an expression]
480~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
481The basic idea of sizeExpr is obvious enough: count nodes.  But getting the
482heuristics right has taken a long time.  Here's the basic strategy:
483
484    * Variables, literals: 0
485      (Exception for string literals, see litSize.)
486
487    * Function applications (f e1 .. en): 1 + #value args
488
489    * Constructor applications: 1, regardless of #args
490
491    * Let(rec): 1 + size of components
492
493    * Note, cast: 0
494
495Examples
496
497  Size  Term
498  --------------
499    0     42#
500    0     x
501    0     True
502    2     f x
503    1     Just x
504    4     f (g x)
505
506Notice that 'x' counts 0, while (f x) counts 2.  That's deliberate: there's
507a function call to account for.  Notice also that constructor applications
508are very cheap, because exposing them to a caller is so valuable.
509
510[25/5/11] All sizes are now multiplied by 10, except for primops
511(which have sizes like 1 or 4.  This makes primops look fantastically
512cheap, and seems to be almost universally beneficial.  Done partly as a
513result of #4978.
514
515Note [Do not inline top-level bottoming functions]
516~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
517The FloatOut pass has gone to some trouble to float out calls to 'error'
518and similar friends.  See Note [Bottoming floats] in GHC.Core.Opt.SetLevels.
519Do not re-inline them!  But we *do* still inline if they are very small
520(the uncondInline stuff).
521
522Note [INLINE for small functions]
523~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
524Consider        {-# INLINE f #-}
525                f x = Just x
526                g y = f y
527Then f's RHS is no larger than its LHS, so we should inline it into
528even the most boring context.  In general, f the function is
529sufficiently small that its body is as small as the call itself, the
530inline unconditionally, regardless of how boring the context is.
531
532Things to note:
533
534(1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
535    than the thing it's replacing.  Notice that
536      (f x) --> (g 3)             -- YES, unconditionally
537      (f x) --> x : []            -- YES, *even though* there are two
538                                  --      arguments to the cons
539      x     --> g 3               -- NO
540      x     --> Just v            -- NO
541
542    It's very important not to unconditionally replace a variable by
543    a non-atomic term.
544
545(2) We do this even if the thing isn't saturated, else we end up with the
546    silly situation that
547       f x y = x
548       ...map (f 3)...
549    doesn't inline.  Even in a boring context, inlining without being
550    saturated will give a lambda instead of a PAP, and will be more
551    efficient at runtime.
552
553(3) However, when the function's arity > 0, we do insist that it
554    has at least one value argument at the call site.  (This check is
555    made in the UnfWhen case of callSiteInline.) Otherwise we find this:
556         f = /\a \x:a. x
557         d = /\b. MkD (f b)
558    If we inline f here we get
559         d = /\b. MkD (\x:b. x)
560    and then prepareRhs floats out the argument, abstracting the type
561    variables, so we end up with the original again!
562
563(4) We must be much more cautious about arity-zero things. Consider
564       let x = y +# z in ...
565    In *size* terms primops look very small, because the generate a
566    single instruction, but we do not want to unconditionally replace
567    every occurrence of x with (y +# z).  So we only do the
568    unconditional-inline thing for *trivial* expressions.
569
570    NB: you might think that PostInlineUnconditionally would do this
571    but it doesn't fire for top-level things; see GHC.Core.Opt.Simplify.Utils
572    Note [Top level and postInlineUnconditionally]
573
574Note [Count coercion arguments in boring contexts]
575~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
576In inlineBoringOK, we ignore type arguments when deciding whether an
577expression is okay to inline into boring contexts. This is good, since
578if we have a definition like
579
580  let y = x @Int in f y y
581
582there’s no reason not to inline y at both use sites — no work is
583actually duplicated. It may seem like the same reasoning applies to
584coercion arguments, and indeed, in #17182 we changed inlineBoringOK to
585treat coercions the same way.
586
587However, this isn’t a good idea: unlike type arguments, which have
588no runtime representation, coercion arguments *do* have a runtime
589representation (albeit the zero-width VoidRep, see Note [Coercion tokens]
590in "GHC.CoreToStg"). This caused trouble in #17787 for DataCon wrappers for
591nullary GADT constructors: the wrappers would be inlined and each use of
592the constructor would lead to a separate allocation instead of just
593sharing the wrapper closure.
594
595The solution: don’t ignore coercion arguments after all.
596-}
597
598uncondInline :: CoreExpr -> Arity -> Int -> Bool
599-- Inline unconditionally if there no size increase
600-- Size of call is arity (+1 for the function)
601-- See Note [INLINE for small functions]
602uncondInline rhs arity size
603  | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
604  | otherwise = exprIsTrivial rhs        -- See Note [INLINE for small functions] (4)
605
606sizeExpr :: DynFlags
607         -> Int             -- Bomb out if it gets bigger than this
608         -> [Id]            -- Arguments; we're interested in which of these
609                            -- get case'd
610         -> CoreExpr
611         -> ExprSize
612
613-- Note [Computing the size of an expression]
614
615sizeExpr dflags bOMB_OUT_SIZE top_args expr
616  = size_up expr
617  where
618    size_up (Cast e _) = size_up e
619    size_up (Tick _ e) = size_up e
620    size_up (Type _)   = sizeZero           -- Types cost nothing
621    size_up (Coercion _) = sizeZero
622    size_up (Lit lit)  = sizeN (litSize lit)
623    size_up (Var f) | isRealWorldId f = sizeZero
624                      -- Make sure we get constructor discounts even
625                      -- on nullary constructors
626                    | otherwise       = size_up_call f [] 0
627
628    size_up (App fun arg)
629      | isTyCoArg arg = size_up fun
630      | otherwise     = size_up arg  `addSizeNSD`
631                        size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)
632
633    size_up (Lam b e)
634      | isId b && not (isRealWorldId b) = lamScrutDiscount dflags (size_up e `addSizeN` 10)
635      | otherwise = size_up e
636
637    size_up (Let (NonRec binder rhs) body)
638      = size_up_rhs (binder, rhs) `addSizeNSD`
639        size_up body              `addSizeN`
640        size_up_alloc binder
641
642    size_up (Let (Rec pairs) body)
643      = foldr (addSizeNSD . size_up_rhs)
644              (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs))
645              pairs
646
647    size_up (Case e _ _ alts)
648        | null alts
649        = size_up e    -- case e of {} never returns, so take size of scrutinee
650
651    size_up (Case e _ _ alts)
652        -- Now alts is non-empty
653        | Just v <- is_top_arg e -- We are scrutinising an argument variable
654        = let
655            alt_sizes = map size_up_alt alts
656
657                  -- alts_size tries to compute a good discount for
658                  -- the case when we are scrutinising an argument variable
659            alts_size (SizeIs tot tot_disc tot_scrut)
660                          -- Size of all alternatives
661                      (SizeIs max _        _)
662                          -- Size of biggest alternative
663                  = SizeIs tot (unitBag (v, 20 + tot - max)
664                      `unionBags` tot_disc) tot_scrut
665                          -- If the variable is known, we produce a
666                          -- discount that will take us back to 'max',
667                          -- the size of the largest alternative The
668                          -- 1+ is a little discount for reduced
669                          -- allocation in the caller
670                          --
671                          -- Notice though, that we return tot_disc,
672                          -- the total discount from all branches.  I
673                          -- think that's right.
674
675            alts_size tot_size _ = tot_size
676          in
677          alts_size (foldr1 addAltSize alt_sizes)  -- alts is non-empty
678                    (foldr1 maxSize    alt_sizes)
679                -- Good to inline if an arg is scrutinised, because
680                -- that may eliminate allocation in the caller
681                -- And it eliminates the case itself
682        where
683          is_top_arg (Var v) | v `elem` top_args = Just v
684          is_top_arg (Cast e _) = is_top_arg e
685          is_top_arg _ = Nothing
686
687
688    size_up (Case e _ _ alts) = size_up e  `addSizeNSD`
689                                foldr (addAltSize . size_up_alt) case_size alts
690      where
691          case_size
692           | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10)
693           | otherwise = sizeZero
694                -- Normally we don't charge for the case itself, but
695                -- we charge one per alternative (see size_up_alt,
696                -- below) to account for the cost of the info table
697                -- and comparisons.
698                --
699                -- However, in certain cases (see is_inline_scrut
700                -- below), no code is generated for the case unless
701                -- there are multiple alts.  In these cases we
702                -- subtract one, making the first alt free.
703                -- e.g. case x# +# y# of _ -> ...   should cost 1
704                --      case touch# x# of _ -> ...  should cost 0
705                -- (see #4978)
706                --
707                -- I would like to not have the "lengthAtMost alts 1"
708                -- condition above, but without that some programs got worse
709                -- (spectral/hartel/event and spectral/para).  I don't fully
710                -- understand why. (SDM 24/5/11)
711
712                -- unboxed variables, inline primops and unsafe foreign calls
713                -- are all "inline" things:
714          is_inline_scrut (Var v) = isUnliftedType (idType v)
715          is_inline_scrut scrut
716              | (Var f, _) <- collectArgs scrut
717                = case idDetails f of
718                    FCallId fc  -> not (isSafeForeignCall fc)
719                    PrimOpId op -> not (primOpOutOfLine op)
720                    _other      -> False
721              | otherwise
722                = False
723
724    size_up_rhs (bndr, rhs)
725      | Just join_arity <- isJoinId_maybe bndr
726        -- Skip arguments to join point
727      , (_bndrs, body) <- collectNBinders join_arity rhs
728      = size_up body
729      | otherwise
730      = size_up rhs
731
732    ------------
733    -- size_up_app is used when there's ONE OR MORE value args
734    size_up_app (App fun arg) args voids
735        | isTyCoArg arg                  = size_up_app fun args voids
736        | isRealWorldExpr arg            = size_up_app fun (arg:args) (voids + 1)
737        | otherwise                      = size_up arg  `addSizeNSD`
738                                           size_up_app fun (arg:args) voids
739    size_up_app (Var fun)     args voids = size_up_call fun args voids
740    size_up_app (Tick _ expr) args voids = size_up_app expr args voids
741    size_up_app (Cast expr _) args voids = size_up_app expr args voids
742    size_up_app other         args voids = size_up other `addSizeN`
743                                           callSize (length args) voids
744       -- if the lhs is not an App or a Var, or an invisible thing like a
745       -- Tick or Cast, then we should charge for a complete call plus the
746       -- size of the lhs itself.
747
748    ------------
749    size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
750    size_up_call fun val_args voids
751       = case idDetails fun of
752           FCallId _        -> sizeN (callSize (length val_args) voids)
753           DataConWorkId dc -> conSize    dc (length val_args)
754           PrimOpId op      -> primOpSize op (length val_args)
755           ClassOpId _      -> classOpSize dflags top_args val_args
756           _                -> funSize dflags top_args fun (length val_args) voids
757
758    ------------
759    size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10
760        -- Don't charge for args, so that wrappers look cheap
761        -- (See comments about wrappers with Case)
762        --
763        -- IMPORTANT: *do* charge 1 for the alternative, else we
764        -- find that giant case nests are treated as practically free
765        -- A good example is Foreign.C.Error.errnoToIOError
766
767    ------------
768    -- Cost to allocate binding with given binder
769    size_up_alloc bndr
770      |  isTyVar bndr                 -- Doesn't exist at runtime
771      || isJoinId bndr                -- Not allocated at all
772      || isUnliftedType (idType bndr) -- Doesn't live in heap
773      = 0
774      | otherwise
775      = 10
776
777    ------------
778        -- These addSize things have to be here because
779        -- I don't want to give them bOMB_OUT_SIZE as an argument
780    addSizeN TooBig          _  = TooBig
781    addSizeN (SizeIs n xs d) m  = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
782
783        -- addAltSize is used to add the sizes of case alternatives
784    addAltSize TooBig            _      = TooBig
785    addAltSize _                 TooBig = TooBig
786    addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
787        = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
788                                 (xs `unionBags` ys)
789                                 (d1 + d2) -- Note [addAltSize result discounts]
790
791        -- This variant ignores the result discount from its LEFT argument
792        -- It's used when the second argument isn't part of the result
793    addSizeNSD TooBig            _      = TooBig
794    addSizeNSD _                 TooBig = TooBig
795    addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
796        = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
797                                 (xs `unionBags` ys)
798                                 d2  -- Ignore d1
799
800    isRealWorldId id = idType id `eqType` realWorldStatePrimTy
801
802    -- an expression of type State# RealWorld must be a variable
803    isRealWorldExpr (Var id)   = isRealWorldId id
804    isRealWorldExpr (Tick _ e) = isRealWorldExpr e
805    isRealWorldExpr _          = False
806
807-- | Finds a nominal size of a string literal.
808litSize :: Literal -> Int
809-- Used by GHC.Core.Unfold.sizeExpr
810litSize (LitNumber LitNumInteger _) = 100   -- Note [Size of literal integers]
811litSize (LitNumber LitNumNatural _) = 100
812litSize (LitString str) = 10 + 10 * ((BS.length str + 3) `div` 4)
813        -- If size could be 0 then @f "x"@ might be too small
814        -- [Sept03: make literal strings a bit bigger to avoid fruitless
815        --  duplication of little strings]
816litSize _other = 0    -- Must match size of nullary constructors
817                      -- Key point: if  x |-> 4, then x must inline unconditionally
818                      --            (eg via case binding)
819
820classOpSize :: DynFlags -> [Id] -> [CoreExpr] -> ExprSize
821-- See Note [Conlike is interesting]
822classOpSize _ _ []
823  = sizeZero
824classOpSize dflags top_args (arg1 : other_args)
825  = SizeIs size arg_discount 0
826  where
827    size = 20 + (10 * length other_args)
828    -- If the class op is scrutinising a lambda bound dictionary then
829    -- give it a discount, to encourage the inlining of this function
830    -- The actual discount is rather arbitrarily chosen
831    arg_discount = case arg1 of
832                     Var dict | dict `elem` top_args
833                              -> unitBag (dict, ufDictDiscount dflags)
834                     _other   -> emptyBag
835
836-- | The size of a function call
837callSize
838 :: Int  -- ^ number of value args
839 -> Int  -- ^ number of value args that are void
840 -> Int
841callSize n_val_args voids = 10 * (1 + n_val_args - voids)
842        -- The 1+ is for the function itself
843        -- Add 1 for each non-trivial arg;
844        -- the allocation cost, as in let(rec)
845
846-- | The size of a jump to a join point
847jumpSize
848 :: Int  -- ^ number of value args
849 -> Int  -- ^ number of value args that are void
850 -> Int
851jumpSize n_val_args voids = 2 * (1 + n_val_args - voids)
852  -- A jump is 20% the size of a function call. Making jumps free reopens
853  -- bug #6048, but making them any more expensive loses a 21% improvement in
854  -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a
855  -- better solution?
856
857funSize :: DynFlags -> [Id] -> Id -> Int -> Int -> ExprSize
858-- Size for functions that are not constructors or primops
859-- Note [Function applications]
860funSize dflags top_args fun n_val_args voids
861  | fun `hasKey` buildIdKey   = buildSize
862  | fun `hasKey` augmentIdKey = augmentSize
863  | otherwise = SizeIs size arg_discount res_discount
864  where
865    some_val_args = n_val_args > 0
866    is_join = isJoinId fun
867
868    size | is_join              = jumpSize n_val_args voids
869         | not some_val_args    = 0
870         | otherwise            = callSize n_val_args voids
871
872        --                  DISCOUNTS
873        --  See Note [Function and non-function discounts]
874    arg_discount | some_val_args && fun `elem` top_args
875                 = unitBag (fun, ufFunAppDiscount dflags)
876                 | otherwise = emptyBag
877        -- If the function is an argument and is applied
878        -- to some values, give it an arg-discount
879
880    res_discount | idArity fun > n_val_args = ufFunAppDiscount dflags
881                 | otherwise                = 0
882        -- If the function is partially applied, show a result discount
883-- XXX maybe behave like ConSize for eval'd variable
884
885conSize :: DataCon -> Int -> ExprSize
886conSize dc n_val_args
887  | n_val_args == 0 = SizeIs 0 emptyBag 10    -- Like variables
888
889-- See Note [Unboxed tuple size and result discount]
890  | isUnboxedTupleCon dc = SizeIs 0 emptyBag 10
891
892-- See Note [Constructor size and result discount]
893  | otherwise = SizeIs 10 emptyBag 10
894
895{- Note [Constructor size and result discount]
896~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
897Treat a constructors application as size 10, regardless of how many
898arguments it has; we are keen to expose them (and we charge separately
899for their args).  We can't treat them as size zero, else we find that
900(Just x) has size 0, which is the same as a lone variable; and hence
901'v' will always be replaced by (Just x), where v is bound to Just x.
902
903The "result discount" is applied if the result of the call is
904scrutinised (say by a case).  For a constructor application that will
905mean the constructor application will disappear, so we don't need to
906charge it to the function.  So the discount should at least match the
907cost of the constructor application, namely 10.
908
909Historical note 1: Until Jun 2020 we gave it a "bit of extra
910incentive" via a discount of 10*(1 + n_val_args), but that was FAR too
911much (#18282).  In particular, consider a huge case tree like
912
913   let r = case y1 of
914          Nothing -> B1 a b c
915          Just v1 -> case y2 of
916                      Nothing -> B1 c b a
917                      Just v2 -> ...
918
919If conSize gives a cost of 10 (regardless of n_val_args) and a
920discount of 10, that'll make each alternative RHS cost zero.  We
921charge 10 for each case alternative (see size_up_alt).  If we give a
922bigger discount (say 20) in conSize, we'll make the case expression
923cost *nothing*, and that can make a huge case tree cost nothing. This
924leads to massive, sometimes exponenial inlinings (#18282).  In short,
925don't give a discount that give a negative size to a sub-expression!
926
927Historical note 2: Much longer ago, Simon M tried a MUCH bigger
928discount: (10 * (10 + n_val_args)), and said it was an "unambiguous
929win", but its terribly dangerous because a function with many many
930case branches, each finishing with a constructor, can have an
931arbitrarily large discount.  This led to terrible code bloat: see
932#6099.
933
934Note [Unboxed tuple size and result discount]
935~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
936However, unboxed tuples count as size zero. I found occasions where we had
937        f x y z = case op# x y z of { s -> (# s, () #) }
938and f wasn't getting inlined.
939
940I tried giving unboxed tuples a *result discount* of zero (see the
941commented-out line).  Why?  When returned as a result they do not
942allocate, so maybe we don't want to charge so much for them. If you
943have a non-zero discount here, we find that workers often get inlined
944back into wrappers, because it look like
945    f x = case $wf x of (# a,b #) -> (a,b)
946and we are keener because of the case.  However while this change
947shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
948more. All other changes were very small. So it's not a big deal but I
949didn't adopt the idea.
950
951When fixing #18282 (see Note [Constructor size and result discount])
952I changed the result discount to be just 10, not 10*(1+n_val_args).
953
954Note [Function and non-function discounts]
955~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
956We want a discount if the function is applied. A good example is
957monadic combinators with continuation arguments, where inlining is
958quite important.
959
960But we don't want a big discount when a function is called many times
961(see the detailed comments with #6048) because if the function is
962big it won't be inlined at its many call sites and no benefit results.
963Indeed, we can get exponentially big inlinings this way; that is what
964#6048 is about.
965
966On the other hand, for data-valued arguments, if there are lots of
967case expressions in the body, each one will get smaller if we apply
968the function to a constructor application, so we *want* a big discount
969if the argument is scrutinised by many case expressions.
970
971Conclusion:
972  - For functions, take the max of the discounts
973  - For data values, take the sum of the discounts
974
975
976Note [Literal integer size]
977~~~~~~~~~~~~~~~~~~~~~~~~~~~
978Literal integers *can* be big (mkInteger [...coefficients...]), but
979need not be (IS n).  We just use an arbitrary big-ish constant here
980so that, in particular, we don't inline top-level defns like
981   n = IS 5
982There's no point in doing so -- any optimisations will see the IS
983through n's unfolding.  Nor will a big size inhibit unfoldings functions
984that mention a literal Integer, because the float-out pass will float
985all those constants to top level.
986-}
987
988primOpSize :: PrimOp -> Int -> ExprSize
989primOpSize op n_val_args
990 = if primOpOutOfLine op
991      then sizeN (op_size + n_val_args)
992      else sizeN op_size
993 where
994   op_size = primOpCodeSize op
995
996
997buildSize :: ExprSize
998buildSize = SizeIs 0 emptyBag 40
999        -- We really want to inline applications of build
1000        -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
1001        -- Indeed, we should add a result_discount because build is
1002        -- very like a constructor.  We don't bother to check that the
1003        -- build is saturated (it usually is).  The "-2" discounts for the \c n,
1004        -- The "4" is rather arbitrary.
1005
1006augmentSize :: ExprSize
1007augmentSize = SizeIs 0 emptyBag 40
1008        -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
1009        -- e plus ys. The -2 accounts for the \cn
1010
1011-- When we return a lambda, give a discount if it's used (applied)
1012lamScrutDiscount :: DynFlags -> ExprSize -> ExprSize
1013lamScrutDiscount dflags (SizeIs n vs _) = SizeIs n vs (ufFunAppDiscount dflags)
1014lamScrutDiscount _      TooBig          = TooBig
1015
1016{-
1017Note [addAltSize result discounts]
1018~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1019When adding the size of alternatives, we *add* the result discounts
1020too, rather than take the *maximum*.  For a multi-branch case, this
1021gives a discount for each branch that returns a constructor, making us
1022keener to inline.  I did try using 'max' instead, but it makes nofib
1023'rewrite' and 'puzzle' allocate significantly more, and didn't make
1024binary sizes shrink significantly either.
1025
1026Note [Discounts and thresholds]
1027~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1028Constants for discounts and thesholds are defined in "GHC.Driver.Session",
1029all of form ufXxxx.   They are:
1030
1031ufCreationThreshold
1032     At a definition site, if the unfolding is bigger than this, we
1033     may discard it altogether
1034
1035ufUseThreshold
1036     At a call site, if the unfolding, less discounts, is smaller than
1037     this, then it's small enough inline
1038
1039ufDictDiscount
1040     The discount for each occurrence of a dictionary argument
1041     as an argument of a class method.  Should be pretty small
1042     else big functions may get inlined
1043
1044ufFunAppDiscount
1045     Discount for a function argument that is applied.  Quite
1046     large, because if we inline we avoid the higher-order call.
1047
1048ufDearOp
1049     The size of a foreign call or not-dupable PrimOp
1050
1051ufVeryAggressive
1052     If True, the compiler ignores all the thresholds and inlines very
1053     aggressively. It still adheres to arity, simplifier phase control and
1054     loop breakers.
1055
1056
1057Historical Note: Before April 2020 we had another factor,
1058ufKeenessFactor, which would scale the discounts before they were subtracted
1059from the size. This was justified with the following comment:
1060
1061  -- We multiply the raw discounts (args_discount and result_discount)
1062  -- ty opt_UnfoldingKeenessFactor because the former have to do with
1063  --  *size* whereas the discounts imply that there's some extra
1064  --  *efficiency* to be gained (e.g. beta reductions, case reductions)
1065  -- by inlining.
1066
1067However, this is highly suspect since it means that we subtract a *scaled* size
1068from an absolute size, resulting in crazy (e.g. negative) scores in some cases
1069(#15304). We consequently killed off ufKeenessFactor and bumped up the
1070ufUseThreshold to compensate.
1071
1072
1073Note [Function applications]
1074~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1075In a function application (f a b)
1076
1077  - If 'f' is an argument to the function being analysed,
1078    and there's at least one value arg, record a FunAppDiscount for f
1079
1080  - If the application if a PAP (arity > 2 in this example)
1081    record a *result* discount (because inlining
1082    with "extra" args in the call may mean that we now
1083    get a saturated application)
1084
1085Code for manipulating sizes
1086-}
1087
1088-- | The size of a candidate expression for unfolding
1089data ExprSize
1090    = TooBig
1091    | SizeIs { _es_size_is  :: {-# UNPACK #-} !Int -- ^ Size found
1092             , _es_args     :: !(Bag (Id,Int))
1093               -- ^ Arguments cased herein, and discount for each such
1094             , _es_discount :: {-# UNPACK #-} !Int
1095               -- ^ Size to subtract if result is scrutinised by a case
1096               -- expression
1097             }
1098
1099instance Outputable ExprSize where
1100  ppr TooBig         = text "TooBig"
1101  ppr (SizeIs a _ c) = brackets (int a <+> int c)
1102
1103-- subtract the discount before deciding whether to bale out. eg. we
1104-- want to inline a large constructor application into a selector:
1105--      tup = (a_1, ..., a_99)
1106--      x = case tup of ...
1107--
1108mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
1109mkSizeIs max n xs d | (n - d) > max = TooBig
1110                    | otherwise     = SizeIs n xs d
1111
1112maxSize :: ExprSize -> ExprSize -> ExprSize
1113maxSize TooBig         _                                  = TooBig
1114maxSize _              TooBig                             = TooBig
1115maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2   = s1
1116                                              | otherwise = s2
1117
1118sizeZero :: ExprSize
1119sizeN :: Int -> ExprSize
1120
1121sizeZero = SizeIs 0 emptyBag 0
1122sizeN n  = SizeIs n emptyBag 0
1123
1124{-
1125************************************************************************
1126*                                                                      *
1127\subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
1128*                                                                      *
1129************************************************************************
1130
1131We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
1132we ``couldn't possibly use'' on the other side.  Can be overridden w/
1133flaggery.  Just the same as smallEnoughToInline, except that it has no
1134actual arguments.
1135-}
1136
1137couldBeSmallEnoughToInline :: DynFlags -> Int -> CoreExpr -> Bool
1138couldBeSmallEnoughToInline dflags threshold rhs
1139  = case sizeExpr dflags threshold [] body of
1140       TooBig -> False
1141       _      -> True
1142  where
1143    (_, body) = collectBinders rhs
1144
1145----------------
1146smallEnoughToInline :: DynFlags -> Unfolding -> Bool
1147smallEnoughToInline dflags (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}})
1148  = size <= ufUseThreshold dflags
1149smallEnoughToInline _ _
1150  = False
1151
1152----------------
1153
1154certainlyWillInline :: DynFlags -> IdInfo -> Maybe Unfolding
1155-- ^ Sees if the unfolding is pretty certain to inline.
1156-- If so, return a *stable* unfolding for it, that will always inline.
1157certainlyWillInline dflags fn_info
1158  = case unfoldingInfo fn_info of
1159      CoreUnfolding { uf_tmpl = e, uf_guidance = g }
1160        | loop_breaker -> Nothing      -- Won't inline, so try w/w
1161        | noinline     -> Nothing      -- See Note [Worker-wrapper for NOINLINE functions]
1162        | otherwise    -> do_cunf e g  -- Depends on size, so look at that
1163
1164      DFunUnfolding {} -> Just fn_unf  -- Don't w/w DFuns; it never makes sense
1165                                       -- to do so, and even if it is currently a
1166                                       -- loop breaker, it may not be later
1167
1168      _other_unf       -> Nothing
1169
1170  where
1171    loop_breaker = isStrongLoopBreaker (occInfo fn_info)
1172    noinline     = inlinePragmaSpec (inlinePragInfo fn_info) == NoInline
1173    fn_unf       = unfoldingInfo fn_info
1174
1175    do_cunf :: CoreExpr -> UnfoldingGuidance -> Maybe Unfolding
1176    do_cunf _ UnfNever     = Nothing
1177    do_cunf _ (UnfWhen {}) = Just (fn_unf { uf_src = InlineStable })
1178                             -- INLINE functions have UnfWhen
1179
1180        -- The UnfIfGoodArgs case seems important.  If we w/w small functions
1181        -- binary sizes go up by 10%!  (This is with SplitObjs.)
1182        -- I'm not totally sure why.
1183        -- INLINABLE functions come via this path
1184        --    See Note [certainlyWillInline: INLINABLE]
1185    do_cunf expr (UnfIfGoodArgs { ug_size = size, ug_args = args })
1186      | arityInfo fn_info > 0  -- See Note [certainlyWillInline: be careful of thunks]
1187      , not (isDeadEndSig (strictnessInfo fn_info))
1188              -- Do not unconditionally inline a bottoming functions even if
1189              -- it seems smallish. We've carefully lifted it out to top level,
1190              -- so we don't want to re-inline it.
1191      , let unf_arity = length args
1192      , size - (10 * (unf_arity + 1)) <= ufUseThreshold dflags
1193      = Just (fn_unf { uf_src      = InlineStable
1194                     , uf_guidance = UnfWhen { ug_arity     = unf_arity
1195                                             , ug_unsat_ok  = unSaturatedOk
1196                                             , ug_boring_ok = inlineBoringOk expr } })
1197             -- Note the "unsaturatedOk". A function like  f = \ab. a
1198             -- will certainly inline, even if partially applied (f e), so we'd
1199             -- better make sure that the transformed inlining has the same property
1200      | otherwise
1201      = Nothing
1202
1203{- Note [certainlyWillInline: be careful of thunks]
1204~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1205Don't claim that thunks will certainly inline, because that risks work
1206duplication.  Even if the work duplication is not great (eg is_cheap
1207holds), it can make a big difference in an inner loop In #5623 we
1208found that the WorkWrap phase thought that
1209       y = case x of F# v -> F# (v +# v)
1210was certainlyWillInline, so the addition got duplicated.
1211
1212Note that we check arityInfo instead of the arity of the unfolding to detect
1213this case. This is so that we don't accidentally fail to inline small partial
1214applications, like `f = g 42` (where `g` recurses into `f`) where g has arity 2
1215(say). Here there is no risk of work duplication, and the RHS is tiny, so
1216certainlyWillInline should return True. But `unf_arity` is zero! However f's
1217arity, gotten from `arityInfo fn_info`, is 1.
1218
1219Failing to say that `f` will inline forces W/W to generate a potentially huge
1220worker for f that will immediately cancel with `g`'s wrapper anyway, causing
1221unnecessary churn in the Simplifier while arriving at the same result.
1222
1223Note [certainlyWillInline: INLINABLE]
1224~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1225certainlyWillInline /must/ return Nothing for a large INLINABLE thing,
1226even though we have a stable inlining, so that strictness w/w takes
1227place.  It makes a big difference to efficiency, and the w/w pass knows
1228how to transfer the INLINABLE info to the worker; see WorkWrap
1229Note [Worker-wrapper for INLINABLE functions]
1230
1231************************************************************************
1232*                                                                      *
1233\subsection{callSiteInline}
1234*                                                                      *
1235************************************************************************
1236
1237This is the key function.  It decides whether to inline a variable at a call site
1238
1239callSiteInline is used at call sites, so it is a bit more generous.
1240It's a very important function that embodies lots of heuristics.
1241A non-WHNF can be inlined if it doesn't occur inside a lambda,
1242and occurs exactly once or
1243    occurs once in each branch of a case and is small
1244
1245If the thing is in WHNF, there's no danger of duplicating work,
1246so we can inline if it occurs once, or is small
1247
1248NOTE: we don't want to inline top-level functions that always diverge.
1249It just makes the code bigger.  Tt turns out that the convenient way to prevent
1250them inlining is to give them a NOINLINE pragma, which we do in
1251StrictAnal.addStrictnessInfoToTopId
1252-}
1253
1254callSiteInline :: DynFlags
1255               -> Id                    -- The Id
1256               -> Bool                  -- True <=> unfolding is active
1257               -> Bool                  -- True if there are no arguments at all (incl type args)
1258               -> [ArgSummary]          -- One for each value arg; True if it is interesting
1259               -> CallCtxt              -- True <=> continuation is interesting
1260               -> Maybe CoreExpr        -- Unfolding, if any
1261
1262data ArgSummary = TrivArg       -- Nothing interesting
1263                | NonTrivArg    -- Arg has structure
1264                | ValueArg      -- Arg is a con-app or PAP
1265                                -- ..or con-like. Note [Conlike is interesting]
1266
1267instance Outputable ArgSummary where
1268  ppr TrivArg    = text "TrivArg"
1269  ppr NonTrivArg = text "NonTrivArg"
1270  ppr ValueArg   = text "ValueArg"
1271
1272nonTriv ::  ArgSummary -> Bool
1273nonTriv TrivArg = False
1274nonTriv _       = True
1275
1276data CallCtxt
1277  = BoringCtxt
1278  | RhsCtxt             -- Rhs of a let-binding; see Note [RHS of lets]
1279  | DiscArgCtxt         -- Argument of a function with non-zero arg discount
1280  | RuleArgCtxt         -- We are somewhere in the argument of a function with rules
1281
1282  | ValAppCtxt          -- We're applied to at least one value arg
1283                        -- This arises when we have ((f x |> co) y)
1284                        -- Then the (f x) has argument 'x' but in a ValAppCtxt
1285
1286  | CaseCtxt            -- We're the scrutinee of a case
1287                        -- that decomposes its scrutinee
1288
1289instance Outputable CallCtxt where
1290  ppr CaseCtxt    = text "CaseCtxt"
1291  ppr ValAppCtxt  = text "ValAppCtxt"
1292  ppr BoringCtxt  = text "BoringCtxt"
1293  ppr RhsCtxt     = text "RhsCtxt"
1294  ppr DiscArgCtxt = text "DiscArgCtxt"
1295  ppr RuleArgCtxt = text "RuleArgCtxt"
1296
1297callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info
1298  = case idUnfolding id of
1299      -- idUnfolding checks for loop-breakers, returning NoUnfolding
1300      -- Things with an INLINE pragma may have an unfolding *and*
1301      -- be a loop breaker  (maybe the knot is not yet untied)
1302        CoreUnfolding { uf_tmpl = unf_template
1303                      , uf_is_work_free = is_wf
1304                      , uf_guidance = guidance, uf_expandable = is_exp }
1305          | active_unfolding -> tryUnfolding dflags id lone_variable
1306                                    arg_infos cont_info unf_template
1307                                    is_wf is_exp guidance
1308          | otherwise -> traceInline dflags id "Inactive unfolding:" (ppr id) Nothing
1309        NoUnfolding      -> Nothing
1310        BootUnfolding    -> Nothing
1311        OtherCon {}      -> Nothing
1312        DFunUnfolding {} -> Nothing     -- Never unfold a DFun
1313
1314-- | Report the inlining of an identifier's RHS to the user, if requested.
1315traceInline :: DynFlags -> Id -> String -> SDoc -> a -> a
1316traceInline dflags inline_id str doc result
1317  -- We take care to ensure that doc is used in only one branch, ensuring that
1318  -- the simplifier can push its allocation into the branch. See Note [INLINE
1319  -- conditional tracing utilities].
1320  | enable    = traceAction dflags str doc result
1321  | otherwise = result
1322  where
1323    enable
1324      | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags
1325      = True
1326      | Just prefix <- inlineCheck dflags
1327      = prefix `isPrefixOf` occNameString (getOccName inline_id)
1328      | otherwise
1329      = False
1330{-# INLINE traceInline #-} -- see Note [INLINE conditional tracing utilities]
1331
1332tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt
1333             -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance
1334             -> Maybe CoreExpr
1335tryUnfolding dflags id lone_variable
1336             arg_infos cont_info unf_template
1337             is_wf is_exp guidance
1338 = case guidance of
1339     UnfNever -> traceInline dflags id str (text "UnfNever") Nothing
1340
1341     UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
1342        | enough_args && (boring_ok || some_benefit || ufVeryAggressive dflags)
1343                -- See Note [INLINE for small functions (3)]
1344        -> traceInline dflags id str (mk_doc some_benefit empty True) (Just unf_template)
1345        | otherwise
1346        -> traceInline dflags id str (mk_doc some_benefit empty False) Nothing
1347        where
1348          some_benefit = calc_some_benefit uf_arity
1349          enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
1350
1351     UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
1352        | ufVeryAggressive dflags
1353        -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1354        | is_wf && some_benefit && small_enough
1355        -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1356        | otherwise
1357        -> traceInline dflags id str (mk_doc some_benefit extra_doc False) Nothing
1358        where
1359          some_benefit = calc_some_benefit (length arg_discounts)
1360          extra_doc = text "discounted size =" <+> int discounted_size
1361          discounted_size = size - discount
1362          small_enough = discounted_size <= ufUseThreshold dflags
1363          discount = computeDiscount arg_discounts res_discount arg_infos cont_info
1364
1365  where
1366    mk_doc some_benefit extra_doc yes_or_no
1367      = vcat [ text "arg infos" <+> ppr arg_infos
1368             , text "interesting continuation" <+> ppr cont_info
1369             , text "some_benefit" <+> ppr some_benefit
1370             , text "is exp:" <+> ppr is_exp
1371             , text "is work-free:" <+> ppr is_wf
1372             , text "guidance" <+> ppr guidance
1373             , extra_doc
1374             , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
1375
1376    str = "Considering inlining: " ++ showSDocDump dflags (ppr id)
1377    n_val_args = length arg_infos
1378
1379           -- some_benefit is used when the RHS is small enough
1380           -- and the call has enough (or too many) value
1381           -- arguments (ie n_val_args >= arity). But there must
1382           -- be *something* interesting about some argument, or the
1383           -- result context, to make it worth inlining
1384    calc_some_benefit :: Arity -> Bool   -- The Arity is the number of args
1385                                         -- expected by the unfolding
1386    calc_some_benefit uf_arity
1387       | not saturated = interesting_args       -- Under-saturated
1388                                        -- Note [Unsaturated applications]
1389       | otherwise = interesting_args   -- Saturated or over-saturated
1390                  || interesting_call
1391      where
1392        saturated      = n_val_args >= uf_arity
1393        over_saturated = n_val_args > uf_arity
1394        interesting_args = any nonTriv arg_infos
1395                -- NB: (any nonTriv arg_infos) looks at the
1396                -- over-saturated args too which is "wrong";
1397                -- but if over-saturated we inline anyway.
1398
1399        interesting_call
1400          | over_saturated
1401          = True
1402          | otherwise
1403          = case cont_info of
1404              CaseCtxt   -> not (lone_variable && is_exp)  -- Note [Lone variables]
1405              ValAppCtxt -> True                           -- Note [Cast then apply]
1406              RuleArgCtxt -> uf_arity > 0  -- See Note [Unfold info lazy contexts]
1407              DiscArgCtxt -> uf_arity > 0  -- Note [Inlining in ArgCtxt]
1408              RhsCtxt     -> uf_arity > 0  --
1409              _other      -> False         -- See Note [Nested functions]
1410
1411
1412{-
1413Note [Unfold into lazy contexts], Note [RHS of lets]
1414~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1415When the call is the argument of a function with a RULE, or the RHS of a let,
1416we are a little bit keener to inline.  For example
1417     f y = (y,y,y)
1418     g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
1419We'd inline 'f' if the call was in a case context, and it kind-of-is,
1420only we can't see it.  Also
1421     x = f v
1422could be expensive whereas
1423     x = case v of (a,b) -> a
1424is patently cheap and may allow more eta expansion.
1425So we treat the RHS of a let as not-totally-boring.
1426
1427Note [Unsaturated applications]
1428~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1429When a call is not saturated, we *still* inline if one of the
1430arguments has interesting structure.  That's sometimes very important.
1431A good example is the Ord instance for Bool in Base:
1432
1433 Rec {
1434    $fOrdBool =GHC.Classes.D:Ord
1435                 @ Bool
1436                 ...
1437                 $cmin_ajX
1438
1439    $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
1440    $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
1441  }
1442
1443But the defn of GHC.Classes.$dmmin is:
1444
1445  $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
1446    {- Arity: 3, HasNoCafRefs, Strictness: SLL,
1447       Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
1448                   case @ a GHC.Classes.<= @ a $dOrd x y of wild {
1449                     GHC.Types.False -> y GHC.Types.True -> x }) -}
1450
1451We *really* want to inline $dmmin, even though it has arity 3, in
1452order to unravel the recursion.
1453
1454
1455Note [Things to watch]
1456~~~~~~~~~~~~~~~~~~~~~~
1457*   { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
1458    Assume x is exported, so not inlined unconditionally.
1459    Then we want x to inline unconditionally; no reason for it
1460    not to, and doing so avoids an indirection.
1461
1462*   { x = I# 3; ....f x.... }
1463    Make sure that x does not inline unconditionally!
1464    Lest we get extra allocation.
1465
1466Note [Inlining an InlineRule]
1467~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1468An InlineRules is used for
1469  (a) programmer INLINE pragmas
1470  (b) inlinings from worker/wrapper
1471
1472For (a) the RHS may be large, and our contract is that we *only* inline
1473when the function is applied to all the arguments on the LHS of the
1474source-code defn.  (The uf_arity in the rule.)
1475
1476However for worker/wrapper it may be worth inlining even if the
1477arity is not satisfied (as we do in the CoreUnfolding case) so we don't
1478require saturation.
1479
1480Note [Nested functions]
1481~~~~~~~~~~~~~~~~~~~~~~~
1482At one time we treated a call of a non-top-level function as
1483"interesting" (regardless of how boring the context) in the hope
1484that inlining it would eliminate the binding, and its allocation.
1485Specifically, in the default case of interesting_call we had
1486   _other -> not is_top && uf_arity > 0
1487
1488But actually postInlineUnconditionally does some of this and overall
1489it makes virtually no difference to nofib.  So I simplified away this
1490special case
1491
1492Note [Cast then apply]
1493~~~~~~~~~~~~~~~~~~~~~~
1494Consider
1495   myIndex = __inline_me ( (/\a. <blah>) |> co )
1496   co :: (forall a. a -> a) ~ (forall a. T a)
1497     ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
1498
1499We need to inline myIndex to unravel this; but the actual call (myIndex a) has
1500no value arguments.  The ValAppCtxt gives it enough incentive to inline.
1501
1502Note [Inlining in ArgCtxt]
1503~~~~~~~~~~~~~~~~~~~~~~~~~~
1504The condition (arity > 0) here is very important, because otherwise
1505we end up inlining top-level stuff into useless places; eg
1506   x = I# 3#
1507   f = \y.  g x
1508This can make a very big difference: it adds 16% to nofib 'integer' allocs,
1509and 20% to 'power'.
1510
1511At one stage I replaced this condition by 'True' (leading to the above
1512slow-down).  The motivation was test eyeball/inline1.hs; but that seems
1513to work ok now.
1514
1515NOTE: arguably, we should inline in ArgCtxt only if the result of the
1516call is at least CONLIKE.  At least for the cases where we use ArgCtxt
1517for the RHS of a 'let', we only profit from the inlining if we get a
1518CONLIKE thing (modulo lets).
1519
1520Note [Lone variables]   See also Note [Interaction of exprIsWorkFree and lone variables]
1521~~~~~~~~~~~~~~~~~~~~~   which appears below
1522The "lone-variable" case is important.  I spent ages messing about
1523with unsatisfactory variants, but this is nice.  The idea is that if a
1524variable appears all alone
1525
1526        as an arg of lazy fn, or rhs    BoringCtxt
1527        as scrutinee of a case          CaseCtxt
1528        as arg of a fn                  ArgCtxt
1529AND
1530        it is bound to a cheap expression
1531
1532then we should not inline it (unless there is some other reason,
1533e.g. it is the sole occurrence).  That is what is happening at
1534the use of 'lone_variable' in 'interesting_call'.
1535
1536Why?  At least in the case-scrutinee situation, turning
1537        let x = (a,b) in case x of y -> ...
1538into
1539        let x = (a,b) in case (a,b) of y -> ...
1540and thence to
1541        let x = (a,b) in let y = (a,b) in ...
1542is bad if the binding for x will remain.
1543
1544Another example: I discovered that strings
1545were getting inlined straight back into applications of 'error'
1546because the latter is strict.
1547        s = "foo"
1548        f = \x -> ...(error s)...
1549
1550Fundamentally such contexts should not encourage inlining because, provided
1551the RHS is "expandable" (see Note [exprIsExpandable] in GHC.Core.Utils) the
1552context can ``see'' the unfolding of the variable (e.g. case or a
1553RULE) so there's no gain.
1554
1555However, watch out:
1556
1557 * Consider this:
1558        foo = _inline_ (\n. [n])
1559        bar = _inline_ (foo 20)
1560        baz = \n. case bar of { (m:_) -> m + n }
1561   Here we really want to inline 'bar' so that we can inline 'foo'
1562   and the whole thing unravels as it should obviously do.  This is
1563   important: in the NDP project, 'bar' generates a closure data
1564   structure rather than a list.
1565
1566   So the non-inlining of lone_variables should only apply if the
1567   unfolding is regarded as cheap; because that is when exprIsConApp_maybe
1568   looks through the unfolding.  Hence the "&& is_wf" in the
1569   InlineRule branch.
1570
1571 * Even a type application or coercion isn't a lone variable.
1572   Consider
1573        case $fMonadST @ RealWorld of { :DMonad a b c -> c }
1574   We had better inline that sucker!  The case won't see through it.
1575
1576   For now, I'm treating treating a variable applied to types
1577   in a *lazy* context "lone". The motivating example was
1578        f = /\a. \x. BIG
1579        g = /\a. \y.  h (f a)
1580   There's no advantage in inlining f here, and perhaps
1581   a significant disadvantage.  Hence some_val_args in the Stop case
1582
1583Note [Interaction of exprIsWorkFree and lone variables]
1584~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1585The lone-variable test says "don't inline if a case expression
1586scrutinises a lone variable whose unfolding is cheap".  It's very
1587important that, under these circumstances, exprIsConApp_maybe
1588can spot a constructor application. So, for example, we don't
1589consider
1590        let x = e in (x,x)
1591to be cheap, and that's good because exprIsConApp_maybe doesn't
1592think that expression is a constructor application.
1593
1594In the 'not (lone_variable && is_wf)' test, I used to test is_value
1595rather than is_wf, which was utterly wrong, because the above
1596expression responds True to exprIsHNF, which is what sets is_value.
1597
1598This kind of thing can occur if you have
1599
1600        {-# INLINE foo #-}
1601        foo = let x = e in (x,x)
1602
1603which Roman did.
1604
1605
1606-}
1607
1608computeDiscount :: [Int] -> Int -> [ArgSummary] -> CallCtxt
1609                -> Int
1610computeDiscount arg_discounts res_discount arg_infos cont_info
1611
1612  = 10          -- Discount of 10 because the result replaces the call
1613                -- so we count 10 for the function itself
1614
1615    + 10 * length actual_arg_discounts
1616               -- Discount of 10 for each arg supplied,
1617               -- because the result replaces the call
1618
1619    + total_arg_discount + res_discount'
1620  where
1621    actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
1622    total_arg_discount   = sum actual_arg_discounts
1623
1624    mk_arg_discount _        TrivArg    = 0
1625    mk_arg_discount _        NonTrivArg = 10
1626    mk_arg_discount discount ValueArg   = discount
1627
1628    res_discount'
1629      | LT <- arg_discounts `compareLength` arg_infos
1630      = res_discount   -- Over-saturated
1631      | otherwise
1632      = case cont_info of
1633           BoringCtxt  -> 0
1634           CaseCtxt    -> res_discount  -- Presumably a constructor
1635           ValAppCtxt  -> res_discount  -- Presumably a function
1636           _           -> 40 `min` res_discount
1637                -- ToDo: this 40 `min` res_discount doesn't seem right
1638                --   for DiscArgCtxt it shouldn't matter because the function will
1639                --       get the arg discount for any non-triv arg
1640                --   for RuleArgCtxt we do want to be keener to inline; but not only
1641                --       constructor results
1642                --   for RhsCtxt I suppose that exposing a data con is good in general
1643                --   And 40 seems very arbitrary
1644                --
1645                -- res_discount can be very large when a function returns
1646                -- constructors; but we only want to invoke that large discount
1647                -- when there's a case continuation.
1648                -- Otherwise we, rather arbitrarily, threshold it.  Yuk.
1649                -- But we want to avoid inlining large functions that return
1650                -- constructors into contexts that are simply "interesting"
1651