1{-
2(c) The University of Glasgow 2006
3(c) The AQUA Project, Glasgow University, 1994-1998
4
5
6Core-syntax unfoldings
7
8Unfoldings (which can travel across module boundaries) are in Core
9syntax (namely @CoreExpr@s).
10
11The type @Unfolding@ sits ``above'' simply-Core-expressions
12unfoldings, capturing ``higher-level'' things we know about a binding,
13usually things that the simplifier found out (e.g., ``it's a
14literal'').  In the corner of a @CoreUnfolding@ unfolding, you will
15find, unsurprisingly, a Core expression.
16-}
17
18{-# LANGUAGE CPP #-}
19
20module CoreUnfold (
21        Unfolding, UnfoldingGuidance,   -- Abstract types
22
23        noUnfolding, mkImplicitUnfolding,
24        mkUnfolding, mkCoreUnfolding,
25        mkTopUnfolding, mkSimpleUnfolding, mkWorkerUnfolding,
26        mkInlineUnfolding, mkInlineUnfoldingWithArity,
27        mkInlinableUnfolding, mkWwInlineRule,
28        mkCompulsoryUnfolding, mkDFunUnfolding,
29        specUnfolding,
30
31        ArgSummary(..),
32
33        couldBeSmallEnoughToInline, inlineBoringOk,
34        certainlyWillInline, smallEnoughToInline,
35
36        callSiteInline, CallCtxt(..),
37
38        -- Reexport from CoreSubst (it only live there so it can be used
39        -- by the Very Simple Optimiser)
40        exprIsConApp_maybe, exprIsLiteral_maybe
41    ) where
42
43#include "HsVersions.h"
44
45import GhcPrelude
46
47import DynFlags
48import CoreSyn
49import OccurAnal        ( occurAnalyseExpr_NoBinderSwap )
50import CoreOpt
51import CoreArity       ( manifestArity )
52import CoreUtils
53import Id
54import Demand          ( isBottomingSig )
55import DataCon
56import Literal
57import PrimOp
58import IdInfo
59import BasicTypes       ( Arity, InlineSpec(..), inlinePragmaSpec )
60import Type
61import PrelNames
62import TysPrim          ( realWorldStatePrimTy )
63import Bag
64import Util
65import Outputable
66import ForeignCall
67import Name
68
69import qualified Data.ByteString as BS
70import Data.List
71
72{-
73************************************************************************
74*                                                                      *
75\subsection{Making unfoldings}
76*                                                                      *
77************************************************************************
78-}
79
80mkTopUnfolding :: DynFlags -> Bool -> CoreExpr -> Unfolding
81mkTopUnfolding dflags is_bottoming rhs
82  = mkUnfolding dflags InlineRhs True is_bottoming rhs
83
84mkImplicitUnfolding :: DynFlags -> CoreExpr -> Unfolding
85-- For implicit Ids, do a tiny bit of optimising first
86mkImplicitUnfolding dflags expr
87  = mkTopUnfolding dflags False (simpleOptExpr dflags expr)
88
89-- Note [Top-level flag on inline rules]
90-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
91-- Slight hack: note that mk_inline_rules conservatively sets the
92-- top-level flag to True.  It gets set more accurately by the simplifier
93-- Simplify.simplUnfolding.
94
95mkSimpleUnfolding :: DynFlags -> CoreExpr -> Unfolding
96mkSimpleUnfolding dflags rhs
97  = mkUnfolding dflags InlineRhs False False rhs
98
99mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding
100mkDFunUnfolding bndrs con ops
101  = DFunUnfolding { df_bndrs = bndrs
102                  , df_con = con
103                  , df_args = map occurAnalyseExpr_NoBinderSwap ops }
104                  -- See Note [Occurrence analysis of unfoldings]
105
106mkWwInlineRule :: DynFlags -> CoreExpr -> Arity -> Unfolding
107mkWwInlineRule dflags expr arity
108  = mkCoreUnfolding InlineStable True
109                   (simpleOptExpr dflags expr)
110                   (UnfWhen { ug_arity = arity, ug_unsat_ok = unSaturatedOk
111                            , ug_boring_ok = boringCxtNotOk })
112
113mkCompulsoryUnfolding :: CoreExpr -> Unfolding
114mkCompulsoryUnfolding expr         -- Used for things that absolutely must be unfolded
115  = mkCoreUnfolding InlineCompulsory True
116                    (simpleOptExpr unsafeGlobalDynFlags expr)
117                    (UnfWhen { ug_arity = 0    -- Arity of unfolding doesn't matter
118                             , ug_unsat_ok = unSaturatedOk, ug_boring_ok = boringCxtOk })
119
120mkWorkerUnfolding :: DynFlags -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding
121-- See Note [Worker-wrapper for INLINABLE functions] in WorkWrap
122mkWorkerUnfolding dflags work_fn
123                  (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
124                                 , uf_is_top = top_lvl })
125  | isStableSource src
126  = mkCoreUnfolding src top_lvl new_tmpl guidance
127  where
128    new_tmpl = simpleOptExpr dflags (work_fn tmpl)
129    guidance = calcUnfoldingGuidance dflags False new_tmpl
130
131mkWorkerUnfolding _ _ _ = noUnfolding
132
133-- | Make an unfolding that may be used unsaturated
134-- (ug_unsat_ok = unSaturatedOk) and that is reported as having its
135-- manifest arity (the number of outer lambdas applications will
136-- resolve before doing any work).
137mkInlineUnfolding :: CoreExpr -> Unfolding
138mkInlineUnfolding expr
139  = mkCoreUnfolding InlineStable
140                    True         -- Note [Top-level flag on inline rules]
141                    expr' guide
142  where
143    expr' = simpleOptExpr unsafeGlobalDynFlags expr
144    guide = UnfWhen { ug_arity = manifestArity expr'
145                    , ug_unsat_ok = unSaturatedOk
146                    , ug_boring_ok = boring_ok }
147    boring_ok = inlineBoringOk expr'
148
149-- | Make an unfolding that will be used once the RHS has been saturated
150-- to the given arity.
151mkInlineUnfoldingWithArity :: Arity -> CoreExpr -> Unfolding
152mkInlineUnfoldingWithArity arity expr
153  = mkCoreUnfolding InlineStable
154                    True         -- Note [Top-level flag on inline rules]
155                    expr' guide
156  where
157    expr' = simpleOptExpr unsafeGlobalDynFlags expr
158    guide = UnfWhen { ug_arity = arity
159                    , ug_unsat_ok = needSaturated
160                    , ug_boring_ok = boring_ok }
161    -- See Note [INLINE pragmas and boring contexts] as to why we need to look
162    -- at the arity here.
163    boring_ok | arity == 0 = True
164              | otherwise  = inlineBoringOk expr'
165
166mkInlinableUnfolding :: DynFlags -> CoreExpr -> Unfolding
167mkInlinableUnfolding dflags expr
168  = mkUnfolding dflags InlineStable False False expr'
169  where
170    expr' = simpleOptExpr dflags expr
171
172specUnfolding :: DynFlags
173              -> [Var] -> (CoreExpr -> CoreExpr)
174              -> [CoreArg]   -- LHS arguments in the RULE
175              -> Unfolding -> Unfolding
176-- See Note [Specialising unfoldings]
177-- specUnfolding spec_bndrs spec_args unf
178--   = \spec_bndrs. unf spec_args
179--
180specUnfolding dflags spec_bndrs spec_app rule_lhs_args
181              df@(DFunUnfolding { df_bndrs = old_bndrs, df_con = con, df_args = args })
182  = ASSERT2( rule_lhs_args `equalLength` old_bndrs
183           , ppr df $$ ppr rule_lhs_args )
184           -- For this ASSERT see Note [DFunUnfoldings] in GHC.Core.Opt.Specialise
185    mkDFunUnfolding spec_bndrs con (map spec_arg args)
186      -- For DFunUnfoldings we transform
187      --       \obs. MkD <op1> ... <opn>
188      -- to
189      --       \sbs. MkD ((\obs. <op1>) spec_args) ... ditto <opn>
190  where
191    spec_arg arg = simpleOptExpr dflags $
192                   spec_app (mkLams old_bndrs arg)
193                   -- The beta-redexes created by spec_app will be
194                   -- simplified away by simplOptExpr
195
196specUnfolding dflags spec_bndrs spec_app rule_lhs_args
197              (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
198                             , uf_is_top = top_lvl
199                             , uf_guidance = old_guidance })
200 | isStableSource src  -- See Note [Specialising unfoldings]
201 , UnfWhen { ug_arity     = old_arity } <- old_guidance
202 = mkCoreUnfolding src top_lvl new_tmpl
203                   (old_guidance { ug_arity = old_arity - arity_decrease })
204 where
205   new_tmpl = simpleOptExpr dflags $
206              mkLams spec_bndrs    $
207              spec_app tmpl  -- The beta-redexes created by spec_app
208                             -- will besimplified away by simplOptExpr
209   arity_decrease = count isValArg rule_lhs_args - count isId spec_bndrs
210
211
212specUnfolding _ _ _ _ _ = noUnfolding
213
214{- Note [Specialising unfoldings]
215~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
216When we specialise a function for some given type-class arguments, we use
217specUnfolding to specialise its unfolding.  Some important points:
218
219* If the original function has a DFunUnfolding, the specialised one
220  must do so too!  Otherwise we lose the magic rules that make it
221  interact with ClassOps
222
223* There is a bit of hack for INLINABLE functions:
224     f :: Ord a => ....
225     f = <big-rhs>
226     {- INLINABLE f #-}
227  Now if we specialise f, should the specialised version still have
228  an INLINABLE pragma?  If it does, we'll capture a specialised copy
229  of <big-rhs> as its unfolding, and that probaby won't inline.  But
230  if we don't, the specialised version of <big-rhs> might be small
231  enough to inline at a call site. This happens with Control.Monad.liftM3,
232  and can cause a lot more allocation as a result (nofib n-body shows this).
233
234  Moreover, keeping the INLINABLE thing isn't much help, because
235  the specialised function (probaby) isn't overloaded any more.
236
237  Conclusion: drop the INLINEALE pragma.  In practice what this means is:
238     if a stable unfolding has UnfoldingGuidance of UnfWhen,
239        we keep it (so the specialised thing too will always inline)
240     if a stable unfolding has UnfoldingGuidance of UnfIfGoodArgs
241        (which arises from INLINABLE), we discard it
242
243Note [Honour INLINE on 0-ary bindings]
244~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
245Consider
246
247   x = <expensive>
248   {-# INLINE x #-}
249
250   f y = ...x...
251
252The semantics of an INLINE pragma is
253
254  inline x at every call site, provided it is saturated;
255  that is, applied to at least as many arguments as appear
256  on the LHS of the Haskell source definition.
257
258(This soure-code-derived arity is stored in the `ug_arity` field of
259the `UnfoldingGuidance`.)
260
261In the example, x's ug_arity is 0, so we should inline it at every use
262site.  It's rare to have such an INLINE pragma (usually INLINE Is on
263functions), but it's occasionally very important (#15578, #15519).
264In #15519 we had something like
265   x = case (g a b) of I# r -> T r
266   {-# INLINE x #-}
267   f y = ...(h x)....
268
269where h is strict.  So we got
270   f y = ...(case g a b of I# r -> h (T r))...
271
272and that in turn allowed SpecConstr to ramp up performance.
273
274How do we deliver on this?  By adjusting the ug_boring_ok
275flag in mkInlineUnfoldingWithArity; see
276Note [INLINE pragmas and boring contexts]
277
278NB: there is a real risk that full laziness will float it right back
279out again. Consider again
280  x = factorial 200
281  {-# INLINE x #-}
282  f y = ...x...
283
284After inlining we get
285  f y = ...(factorial 200)...
286
287but it's entirely possible that full laziness will do
288  lvl23 = factorial 200
289  f y = ...lvl23...
290
291That's a problem for another day.
292
293Note [INLINE pragmas and boring contexts]
294~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
295An INLINE pragma uses mkInlineUnfoldingWithArity to build the
296unfolding.  That sets the ug_boring_ok flag to False if the function
297is not tiny (inlineBoringOK), so that even INLINE functions are not
298inlined in an utterly boring context.  E.g.
299     \x y. Just (f y x)
300Nothing is gained by inlining f here, even if it has an INLINE
301pragma.
302
303But for 0-ary bindings, we want to inline regardless; see
304Note [Honour INLINE on 0-ary bindings].
305
306I'm a bit worried that it's possible for the same kind of problem
307to arise for non-0-ary functions too, but let's wait and see.
308-}
309
310mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
311                -> UnfoldingGuidance -> Unfolding
312-- Occurrence-analyses the expression before capturing it
313mkCoreUnfolding src top_lvl expr guidance
314  = CoreUnfolding { uf_tmpl         = occurAnalyseExpr_NoBinderSwap expr,
315                      -- See Note [Occurrence analysis of unfoldings]
316                    uf_src          = src,
317                    uf_is_top       = top_lvl,
318                    uf_is_value     = exprIsHNF        expr,
319                    uf_is_conlike   = exprIsConLike    expr,
320                    uf_is_work_free = exprIsWorkFree   expr,
321                    uf_expandable   = exprIsExpandable expr,
322                    uf_guidance     = guidance }
323
324mkUnfolding :: DynFlags -> UnfoldingSource
325            -> Bool       -- Is top-level
326            -> Bool       -- Definitely a bottoming binding
327                          -- (only relevant for top-level bindings)
328            -> CoreExpr
329            -> Unfolding
330-- Calculates unfolding guidance
331-- Occurrence-analyses the expression before capturing it
332mkUnfolding dflags src is_top_lvl is_bottoming expr
333  = CoreUnfolding { uf_tmpl         = occurAnalyseExpr_NoBinderSwap expr,
334                      -- See Note [Occurrence analysis of unfoldings]
335                    uf_src          = src,
336                    uf_is_top       = is_top_lvl,
337                    uf_is_value     = exprIsHNF        expr,
338                    uf_is_conlike   = exprIsConLike    expr,
339                    uf_expandable   = exprIsExpandable expr,
340                    uf_is_work_free = exprIsWorkFree   expr,
341                    uf_guidance     = guidance }
342  where
343    is_top_bottoming = is_top_lvl && is_bottoming
344    guidance         = calcUnfoldingGuidance dflags is_top_bottoming expr
345        -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr_NoBinderSwap expr))!
346        -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression]
347
348{-
349Note [Occurrence analysis of unfoldings]
350~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
351We do occurrence-analysis of unfoldings once and for all, when the
352unfolding is built, rather than each time we inline them.
353
354But given this decision it's vital that we do
355*always* do it.  Consider this unfolding
356    \x -> letrec { f = ...g...; g* = f } in body
357where g* is (for some strange reason) the loop breaker.  If we don't
358occ-anal it when reading it in, we won't mark g as a loop breaker, and
359we may inline g entirely in body, dropping its binding, and leaving
360the occurrence in f out of scope. This happened in #8892, where
361the unfolding in question was a DFun unfolding.
362
363But more generally, the simplifier is designed on the
364basis that it is looking at occurrence-analysed expressions, so better
365ensure that they acutally are.
366
367We use occurAnalyseExpr_NoBinderSwap instead of occurAnalyseExpr;
368see Note [No binder swap in unfoldings].
369
370Note [No binder swap in unfoldings]
371~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
372The binder swap can temporarily violate Core Lint, by assinging
373a LocalId binding to a GlobalId. For example, if A.foo{r872}
374is a GlobalId with unique r872, then
375
376 case A.foo{r872} of bar {
377   K x -> ...(A.foo{r872})...
378 }
379
380gets transformed to
381
382  case A.foo{r872} of bar {
383    K x -> let foo{r872} = bar
384           in ...(A.foo{r872})...
385
386This is usually not a problem, because the simplifier will transform
387this to:
388
389  case A.foo{r872} of bar {
390    K x -> ...(bar)...
391
392However, after occurrence analysis but before simplification, this extra 'let'
393violates the Core Lint invariant that we do not have local 'let' bindings for
394GlobalIds.  That seems (just) tolerable for the occurrence analysis that happens
395just before the Simplifier, but not for unfoldings, which are Linted
396independently.
397As a quick workaround, we disable binder swap in this module.
398See #16288 and #16296 for further plans.
399
400Note [Calculate unfolding guidance on the non-occ-anal'd expression]
401~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
402Notice that we give the non-occur-analysed expression to
403calcUnfoldingGuidance.  In some ways it'd be better to occur-analyse
404first; for example, sometimes during simplification, there's a large
405let-bound thing which has been substituted, and so is now dead; so
406'expr' contains two copies of the thing while the occurrence-analysed
407expression doesn't.
408
409Nevertheless, we *don't* and *must not* occ-analyse before computing
410the size because
411
412a) The size computation bales out after a while, whereas occurrence
413   analysis does not.
414
415b) Residency increases sharply if you occ-anal first.  I'm not
416   100% sure why, but it's a large effect.  Compiling Cabal went
417   from residency of 534M to over 800M with this one change.
418
419This can occasionally mean that the guidance is very pessimistic;
420it gets fixed up next round.  And it should be rare, because large
421let-bound things that are dead are usually caught by preInlineUnconditionally
422
423
424************************************************************************
425*                                                                      *
426\subsection{The UnfoldingGuidance type}
427*                                                                      *
428************************************************************************
429-}
430
431inlineBoringOk :: CoreExpr -> Bool
432-- See Note [INLINE for small functions]
433-- True => the result of inlining the expression is
434--         no bigger than the expression itself
435--     eg      (\x y -> f y x)
436-- This is a quick and dirty version. It doesn't attempt
437-- to deal with  (\x y z -> x (y z))
438-- The really important one is (x `cast` c)
439inlineBoringOk e
440  = go 0 e
441  where
442    go :: Int -> CoreExpr -> Bool
443    go credit (Lam x e) | isId x           = go (credit+1) e
444                        | otherwise        = go credit e
445        -- See Note [Count coercion arguments in boring contexts]
446    go credit (App f (Type {}))            = go credit f
447    go credit (App f a) | credit > 0
448                        , exprIsTrivial a  = go (credit-1) f
449    go credit (Tick _ e)                   = go credit e -- dubious
450    go credit (Cast e _)                   = go credit e
451    go _      (Var {})                     = boringCxtOk
452    go _      _                            = boringCxtNotOk
453
454calcUnfoldingGuidance
455        :: DynFlags
456        -> Bool          -- Definitely a top-level, bottoming binding
457        -> CoreExpr      -- Expression to look at
458        -> UnfoldingGuidance
459calcUnfoldingGuidance dflags is_top_bottoming (Tick t expr)
460  | not (tickishIsCode t)  -- non-code ticks don't matter for unfolding
461  = calcUnfoldingGuidance dflags is_top_bottoming expr
462calcUnfoldingGuidance dflags is_top_bottoming expr
463  = case sizeExpr dflags bOMB_OUT_SIZE val_bndrs body of
464      TooBig -> UnfNever
465      SizeIs size cased_bndrs scrut_discount
466        | uncondInline expr n_val_bndrs size
467        -> UnfWhen { ug_unsat_ok = unSaturatedOk
468                   , ug_boring_ok =  boringCxtOk
469                   , ug_arity = n_val_bndrs }   -- Note [INLINE for small functions]
470
471        | is_top_bottoming
472        -> UnfNever   -- See Note [Do not inline top-level bottoming functions]
473
474        | otherwise
475        -> UnfIfGoodArgs { ug_args  = map (mk_discount cased_bndrs) val_bndrs
476                         , ug_size  = size
477                         , ug_res   = scrut_discount }
478
479  where
480    (bndrs, body) = collectBinders expr
481    bOMB_OUT_SIZE = ufCreationThreshold dflags
482           -- Bomb out if size gets bigger than this
483    val_bndrs   = filter isId bndrs
484    n_val_bndrs = length val_bndrs
485
486    mk_discount :: Bag (Id,Int) -> Id -> Int
487    mk_discount cbs bndr = foldl' combine 0 cbs
488           where
489             combine acc (bndr', disc)
490               | bndr == bndr' = acc `plus_disc` disc
491               | otherwise     = acc
492
493             plus_disc :: Int -> Int -> Int
494             plus_disc | isFunTy (idType bndr) = max
495                       | otherwise             = (+)
496             -- See Note [Function and non-function discounts]
497
498{-
499Note [Computing the size of an expression]
500~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
501The basic idea of sizeExpr is obvious enough: count nodes.  But getting the
502heuristics right has taken a long time.  Here's the basic strategy:
503
504    * Variables, literals: 0
505      (Exception for string literals, see litSize.)
506
507    * Function applications (f e1 .. en): 1 + #value args
508
509    * Constructor applications: 1, regardless of #args
510
511    * Let(rec): 1 + size of components
512
513    * Note, cast: 0
514
515Examples
516
517  Size  Term
518  --------------
519    0     42#
520    0     x
521    0     True
522    2     f x
523    1     Just x
524    4     f (g x)
525
526Notice that 'x' counts 0, while (f x) counts 2.  That's deliberate: there's
527a function call to account for.  Notice also that constructor applications
528are very cheap, because exposing them to a caller is so valuable.
529
530[25/5/11] All sizes are now multiplied by 10, except for primops
531(which have sizes like 1 or 4.  This makes primops look fantastically
532cheap, and seems to be almost unversally beneficial.  Done partly as a
533result of #4978.
534
535Note [Do not inline top-level bottoming functions]
536~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
537The FloatOut pass has gone to some trouble to float out calls to 'error'
538and similar friends.  See Note [Bottoming floats] in SetLevels.
539Do not re-inline them!  But we *do* still inline if they are very small
540(the uncondInline stuff).
541
542Note [INLINE for small functions]
543~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
544Consider        {-# INLINE f #-}
545                f x = Just x
546                g y = f y
547Then f's RHS is no larger than its LHS, so we should inline it into
548even the most boring context.  In general, f the function is
549sufficiently small that its body is as small as the call itself, the
550inline unconditionally, regardless of how boring the context is.
551
552Things to note:
553
554(1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
555    than the thing it's replacing.  Notice that
556      (f x) --> (g 3)             -- YES, unconditionally
557      (f x) --> x : []            -- YES, *even though* there are two
558                                  --      arguments to the cons
559      x     --> g 3               -- NO
560      x     --> Just v            -- NO
561
562    It's very important not to unconditionally replace a variable by
563    a non-atomic term.
564
565(2) We do this even if the thing isn't saturated, else we end up with the
566    silly situation that
567       f x y = x
568       ...map (f 3)...
569    doesn't inline.  Even in a boring context, inlining without being
570    saturated will give a lambda instead of a PAP, and will be more
571    efficient at runtime.
572
573(3) However, when the function's arity > 0, we do insist that it
574    has at least one value argument at the call site.  (This check is
575    made in the UnfWhen case of callSiteInline.) Otherwise we find this:
576         f = /\a \x:a. x
577         d = /\b. MkD (f b)
578    If we inline f here we get
579         d = /\b. MkD (\x:b. x)
580    and then prepareRhs floats out the argument, abstracting the type
581    variables, so we end up with the original again!
582
583(4) We must be much more cautious about arity-zero things. Consider
584       let x = y +# z in ...
585    In *size* terms primops look very small, because the generate a
586    single instruction, but we do not want to unconditionally replace
587    every occurrence of x with (y +# z).  So we only do the
588    unconditional-inline thing for *trivial* expressions.
589
590    NB: you might think that PostInlineUnconditionally would do this
591    but it doesn't fire for top-level things; see SimplUtils
592    Note [Top level and postInlineUnconditionally]
593
594Note [Count coercion arguments in boring contexts]
595~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
596In inlineBoringOK, we ignore type arguments when deciding whether an
597expression is okay to inline into boring contexts. This is good, since
598if we have a definition like
599
600  let y = x @Int in f y y
601
602there’s no reason not to inline y at both use sites — no work is
603actually duplicated. It may seem like the same reasoning applies to
604coercion arguments, and indeed, in #17182 we changed inlineBoringOK to
605treat coercions the same way.
606
607However, this isn’t a good idea: unlike type arguments, which have
608no runtime representation, coercion arguments *do* have a runtime
609representation (albeit the zero-width VoidRep, see Note [Coercion tokens]
610in CoreToStg.hs). This caused trouble in #17787 for DataCon wrappers for
611nullary GADT constructors: the wrappers would be inlined and each use of
612the constructor would lead to a separate allocation instead of just
613sharing the wrapper closure.
614
615The solution: don’t ignore coercion arguments after all.
616-}
617
618uncondInline :: CoreExpr -> Arity -> Int -> Bool
619-- Inline unconditionally if there no size increase
620-- Size of call is arity (+1 for the function)
621-- See Note [INLINE for small functions]
622uncondInline rhs arity size
623  | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1)
624  | otherwise = exprIsTrivial rhs        -- See Note [INLINE for small functions] (4)
625
626sizeExpr :: DynFlags
627         -> Int             -- Bomb out if it gets bigger than this
628         -> [Id]            -- Arguments; we're interested in which of these
629                            -- get case'd
630         -> CoreExpr
631         -> ExprSize
632
633-- Note [Computing the size of an expression]
634
635sizeExpr dflags bOMB_OUT_SIZE top_args expr
636  = size_up expr
637  where
638    size_up (Cast e _) = size_up e
639    size_up (Tick _ e) = size_up e
640    size_up (Type _)   = sizeZero           -- Types cost nothing
641    size_up (Coercion _) = sizeZero
642    size_up (Lit lit)  = sizeN (litSize lit)
643    size_up (Var f) | isRealWorldId f = sizeZero
644                      -- Make sure we get constructor discounts even
645                      -- on nullary constructors
646                    | otherwise       = size_up_call f [] 0
647
648    size_up (App fun arg)
649      | isTyCoArg arg = size_up fun
650      | otherwise     = size_up arg  `addSizeNSD`
651                        size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)
652
653    size_up (Lam b e)
654      | isId b && not (isRealWorldId b) = lamScrutDiscount dflags (size_up e `addSizeN` 10)
655      | otherwise = size_up e
656
657    size_up (Let (NonRec binder rhs) body)
658      = size_up_rhs (binder, rhs) `addSizeNSD`
659        size_up body              `addSizeN`
660        size_up_alloc binder
661
662    size_up (Let (Rec pairs) body)
663      = foldr (addSizeNSD . size_up_rhs)
664              (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs))
665              pairs
666
667    size_up (Case e _ _ alts)
668        | null alts
669        = size_up e    -- case e of {} never returns, so take size of scrutinee
670
671    size_up (Case e _ _ alts)
672        -- Now alts is non-empty
673        | Just v <- is_top_arg e -- We are scrutinising an argument variable
674        = let
675            alt_sizes = map size_up_alt alts
676
677                  -- alts_size tries to compute a good discount for
678                  -- the case when we are scrutinising an argument variable
679            alts_size (SizeIs tot tot_disc tot_scrut)
680                          -- Size of all alternatives
681                      (SizeIs max _        _)
682                          -- Size of biggest alternative
683                  = SizeIs tot (unitBag (v, 20 + tot - max)
684                      `unionBags` tot_disc) tot_scrut
685                          -- If the variable is known, we produce a
686                          -- discount that will take us back to 'max',
687                          -- the size of the largest alternative The
688                          -- 1+ is a little discount for reduced
689                          -- allocation in the caller
690                          --
691                          -- Notice though, that we return tot_disc,
692                          -- the total discount from all branches.  I
693                          -- think that's right.
694
695            alts_size tot_size _ = tot_size
696          in
697          alts_size (foldr1 addAltSize alt_sizes)  -- alts is non-empty
698                    (foldr1 maxSize    alt_sizes)
699                -- Good to inline if an arg is scrutinised, because
700                -- that may eliminate allocation in the caller
701                -- And it eliminates the case itself
702        where
703          is_top_arg (Var v) | v `elem` top_args = Just v
704          is_top_arg (Cast e _) = is_top_arg e
705          is_top_arg _ = Nothing
706
707
708    size_up (Case e _ _ alts) = size_up e  `addSizeNSD`
709                                foldr (addAltSize . size_up_alt) case_size alts
710      where
711          case_size
712           | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10)
713           | otherwise = sizeZero
714                -- Normally we don't charge for the case itself, but
715                -- we charge one per alternative (see size_up_alt,
716                -- below) to account for the cost of the info table
717                -- and comparisons.
718                --
719                -- However, in certain cases (see is_inline_scrut
720                -- below), no code is generated for the case unless
721                -- there are multiple alts.  In these cases we
722                -- subtract one, making the first alt free.
723                -- e.g. case x# +# y# of _ -> ...   should cost 1
724                --      case touch# x# of _ -> ...  should cost 0
725                -- (see #4978)
726                --
727                -- I would like to not have the "lengthAtMost alts 1"
728                -- condition above, but without that some programs got worse
729                -- (spectral/hartel/event and spectral/para).  I don't fully
730                -- understand why. (SDM 24/5/11)
731
732                -- unboxed variables, inline primops and unsafe foreign calls
733                -- are all "inline" things:
734          is_inline_scrut (Var v) = isUnliftedType (idType v)
735          is_inline_scrut scrut
736              | (Var f, _) <- collectArgs scrut
737                = case idDetails f of
738                    FCallId fc  -> not (isSafeForeignCall fc)
739                    PrimOpId op -> not (primOpOutOfLine op)
740                    _other      -> False
741              | otherwise
742                = False
743
744    size_up_rhs (bndr, rhs)
745      | Just join_arity <- isJoinId_maybe bndr
746        -- Skip arguments to join point
747      , (_bndrs, body) <- collectNBinders join_arity rhs
748      = size_up body
749      | otherwise
750      = size_up rhs
751
752    ------------
753    -- size_up_app is used when there's ONE OR MORE value args
754    size_up_app (App fun arg) args voids
755        | isTyCoArg arg                  = size_up_app fun args voids
756        | isRealWorldExpr arg            = size_up_app fun (arg:args) (voids + 1)
757        | otherwise                      = size_up arg  `addSizeNSD`
758                                           size_up_app fun (arg:args) voids
759    size_up_app (Var fun)     args voids = size_up_call fun args voids
760    size_up_app (Tick _ expr) args voids = size_up_app expr args voids
761    size_up_app (Cast expr _) args voids = size_up_app expr args voids
762    size_up_app other         args voids = size_up other `addSizeN`
763                                           callSize (length args) voids
764       -- if the lhs is not an App or a Var, or an invisible thing like a
765       -- Tick or Cast, then we should charge for a complete call plus the
766       -- size of the lhs itself.
767
768    ------------
769    size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
770    size_up_call fun val_args voids
771       = case idDetails fun of
772           FCallId _        -> sizeN (callSize (length val_args) voids)
773           DataConWorkId dc -> conSize    dc (length val_args)
774           PrimOpId op      -> primOpSize op (length val_args)
775           ClassOpId _      -> classOpSize dflags top_args val_args
776           _                -> funSize dflags top_args fun (length val_args) voids
777
778    ------------
779    size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10
780        -- Don't charge for args, so that wrappers look cheap
781        -- (See comments about wrappers with Case)
782        --
783        -- IMPORTANT: *do* charge 1 for the alternative, else we
784        -- find that giant case nests are treated as practically free
785        -- A good example is Foreign.C.Error.errnoToIOError
786
787    ------------
788    -- Cost to allocate binding with given binder
789    size_up_alloc bndr
790      |  isTyVar bndr                 -- Doesn't exist at runtime
791      || isJoinId bndr                -- Not allocated at all
792      || isUnliftedType (idType bndr) -- Doesn't live in heap
793      = 0
794      | otherwise
795      = 10
796
797    ------------
798        -- These addSize things have to be here because
799        -- I don't want to give them bOMB_OUT_SIZE as an argument
800    addSizeN TooBig          _  = TooBig
801    addSizeN (SizeIs n xs d) m  = mkSizeIs bOMB_OUT_SIZE (n + m) xs d
802
803        -- addAltSize is used to add the sizes of case alternatives
804    addAltSize TooBig            _      = TooBig
805    addAltSize _                 TooBig = TooBig
806    addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
807        = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
808                                 (xs `unionBags` ys)
809                                 (d1 + d2) -- Note [addAltSize result discounts]
810
811        -- This variant ignores the result discount from its LEFT argument
812        -- It's used when the second argument isn't part of the result
813    addSizeNSD TooBig            _      = TooBig
814    addSizeNSD _                 TooBig = TooBig
815    addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
816        = mkSizeIs bOMB_OUT_SIZE (n1 + n2)
817                                 (xs `unionBags` ys)
818                                 d2  -- Ignore d1
819
820    isRealWorldId id = idType id `eqType` realWorldStatePrimTy
821
822    -- an expression of type State# RealWorld must be a variable
823    isRealWorldExpr (Var id)   = isRealWorldId id
824    isRealWorldExpr (Tick _ e) = isRealWorldExpr e
825    isRealWorldExpr _          = False
826
827-- | Finds a nominal size of a string literal.
828litSize :: Literal -> Int
829-- Used by CoreUnfold.sizeExpr
830litSize (LitNumber LitNumInteger _ _) = 100   -- Note [Size of literal integers]
831litSize (LitNumber LitNumNatural _ _) = 100
832litSize (LitString str) = 10 + 10 * ((BS.length str + 3) `div` 4)
833        -- If size could be 0 then @f "x"@ might be too small
834        -- [Sept03: make literal strings a bit bigger to avoid fruitless
835        --  duplication of little strings]
836litSize _other = 0    -- Must match size of nullary constructors
837                      -- Key point: if  x |-> 4, then x must inline unconditionally
838                      --            (eg via case binding)
839
840classOpSize :: DynFlags -> [Id] -> [CoreExpr] -> ExprSize
841-- See Note [Conlike is interesting]
842classOpSize _ _ []
843  = sizeZero
844classOpSize dflags top_args (arg1 : other_args)
845  = SizeIs size arg_discount 0
846  where
847    size = 20 + (10 * length other_args)
848    -- If the class op is scrutinising a lambda bound dictionary then
849    -- give it a discount, to encourage the inlining of this function
850    -- The actual discount is rather arbitrarily chosen
851    arg_discount = case arg1 of
852                     Var dict | dict `elem` top_args
853                              -> unitBag (dict, ufDictDiscount dflags)
854                     _other   -> emptyBag
855
856-- | The size of a function call
857callSize
858 :: Int  -- ^ number of value args
859 -> Int  -- ^ number of value args that are void
860 -> Int
861callSize n_val_args voids = 10 * (1 + n_val_args - voids)
862        -- The 1+ is for the function itself
863        -- Add 1 for each non-trivial arg;
864        -- the allocation cost, as in let(rec)
865
866-- | The size of a jump to a join point
867jumpSize
868 :: Int  -- ^ number of value args
869 -> Int  -- ^ number of value args that are void
870 -> Int
871jumpSize n_val_args voids = 2 * (1 + n_val_args - voids)
872  -- A jump is 20% the size of a function call. Making jumps free reopens
873  -- bug #6048, but making them any more expensive loses a 21% improvement in
874  -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a
875  -- better solution?
876
877funSize :: DynFlags -> [Id] -> Id -> Int -> Int -> ExprSize
878-- Size for functions that are not constructors or primops
879-- Note [Function applications]
880funSize dflags top_args fun n_val_args voids
881  | fun `hasKey` buildIdKey   = buildSize
882  | fun `hasKey` augmentIdKey = augmentSize
883  | otherwise = SizeIs size arg_discount res_discount
884  where
885    some_val_args = n_val_args > 0
886    is_join = isJoinId fun
887
888    size | is_join              = jumpSize n_val_args voids
889         | not some_val_args    = 0
890         | otherwise            = callSize n_val_args voids
891
892        --                  DISCOUNTS
893        --  See Note [Function and non-function discounts]
894    arg_discount | some_val_args && fun `elem` top_args
895                 = unitBag (fun, ufFunAppDiscount dflags)
896                 | otherwise = emptyBag
897        -- If the function is an argument and is applied
898        -- to some values, give it an arg-discount
899
900    res_discount | idArity fun > n_val_args = ufFunAppDiscount dflags
901                 | otherwise                = 0
902        -- If the function is partially applied, show a result discount
903-- XXX maybe behave like ConSize for eval'd variable
904
905conSize :: DataCon -> Int -> ExprSize
906conSize dc n_val_args
907  | n_val_args == 0 = SizeIs 0 emptyBag 10    -- Like variables
908
909-- See Note [Unboxed tuple size and result discount]
910  | isUnboxedTupleCon dc = SizeIs 0 emptyBag (10 * (1 + n_val_args))
911
912-- See Note [Constructor size and result discount]
913  | otherwise = SizeIs 10 emptyBag (10 * (1 + n_val_args))
914
915-- XXX still looks to large to me
916
917{-
918Note [Constructor size and result discount]
919~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
920Treat a constructors application as size 10, regardless of how many
921arguments it has; we are keen to expose them (and we charge separately
922for their args).  We can't treat them as size zero, else we find that
923(Just x) has size 0, which is the same as a lone variable; and hence
924'v' will always be replaced by (Just x), where v is bound to Just x.
925
926The "result discount" is applied if the result of the call is
927scrutinised (say by a case).  For a constructor application that will
928mean the constructor application will disappear, so we don't need to
929charge it to the function.  So the discount should at least match the
930cost of the constructor application, namely 10.  But to give a bit
931of extra incentive we give a discount of 10*(1 + n_val_args).
932
933Simon M tried a MUCH bigger discount: (10 * (10 + n_val_args)),
934and said it was an "unambiguous win", but its terribly dangerous
935because a function with many many case branches, each finishing with
936a constructor, can have an arbitrarily large discount.  This led to
937terrible code bloat: see #6099.
938
939Note [Unboxed tuple size and result discount]
940~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
941However, unboxed tuples count as size zero. I found occasions where we had
942        f x y z = case op# x y z of { s -> (# s, () #) }
943and f wasn't getting inlined.
944
945I tried giving unboxed tuples a *result discount* of zero (see the
946commented-out line).  Why?  When returned as a result they do not
947allocate, so maybe we don't want to charge so much for them If you
948have a non-zero discount here, we find that workers often get inlined
949back into wrappers, because it look like
950    f x = case $wf x of (# a,b #) -> (a,b)
951and we are keener because of the case.  However while this change
952shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
953more. All other changes were very small. So it's not a big deal but I
954didn't adopt the idea.
955
956Note [Function and non-function discounts]
957~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
958We want a discount if the function is applied. A good example is
959monadic combinators with continuation arguments, where inlining is
960quite important.
961
962But we don't want a big discount when a function is called many times
963(see the detailed comments with #6048) because if the function is
964big it won't be inlined at its many call sites and no benefit results.
965Indeed, we can get exponentially big inlinings this way; that is what
966#6048 is about.
967
968On the other hand, for data-valued arguments, if there are lots of
969case expressions in the body, each one will get smaller if we apply
970the function to a constructor application, so we *want* a big discount
971if the argument is scrutinised by many case expressions.
972
973Conclusion:
974  - For functions, take the max of the discounts
975  - For data values, take the sum of the discounts
976
977
978Note [Literal integer size]
979~~~~~~~~~~~~~~~~~~~~~~~~~~~
980Literal integers *can* be big (mkInteger [...coefficients...]), but
981need not be (S# n).  We just use an arbitrary big-ish constant here
982so that, in particular, we don't inline top-level defns like
983   n = S# 5
984There's no point in doing so -- any optimisations will see the S#
985through n's unfolding.  Nor will a big size inhibit unfoldings functions
986that mention a literal Integer, because the float-out pass will float
987all those constants to top level.
988-}
989
990primOpSize :: PrimOp -> Int -> ExprSize
991primOpSize op n_val_args
992 = if primOpOutOfLine op
993      then sizeN (op_size + n_val_args)
994      else sizeN op_size
995 where
996   op_size = primOpCodeSize op
997
998
999buildSize :: ExprSize
1000buildSize = SizeIs 0 emptyBag 40
1001        -- We really want to inline applications of build
1002        -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
1003        -- Indeed, we should add a result_discount because build is
1004        -- very like a constructor.  We don't bother to check that the
1005        -- build is saturated (it usually is).  The "-2" discounts for the \c n,
1006        -- The "4" is rather arbitrary.
1007
1008augmentSize :: ExprSize
1009augmentSize = SizeIs 0 emptyBag 40
1010        -- Ditto (augment t (\cn -> e) ys) should cost only the cost of
1011        -- e plus ys. The -2 accounts for the \cn
1012
1013-- When we return a lambda, give a discount if it's used (applied)
1014lamScrutDiscount :: DynFlags -> ExprSize -> ExprSize
1015lamScrutDiscount dflags (SizeIs n vs _) = SizeIs n vs (ufFunAppDiscount dflags)
1016lamScrutDiscount _      TooBig          = TooBig
1017
1018{-
1019Note [addAltSize result discounts]
1020~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1021When adding the size of alternatives, we *add* the result discounts
1022too, rather than take the *maximum*.  For a multi-branch case, this
1023gives a discount for each branch that returns a constructor, making us
1024keener to inline.  I did try using 'max' instead, but it makes nofib
1025'rewrite' and 'puzzle' allocate significantly more, and didn't make
1026binary sizes shrink significantly either.
1027
1028Note [Discounts and thresholds]
1029~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1030Constants for discounts and thesholds are defined in main/DynFlags,
1031all of form ufXxxx.   They are:
1032
1033ufCreationThreshold
1034     At a definition site, if the unfolding is bigger than this, we
1035     may discard it altogether
1036
1037ufUseThreshold
1038     At a call site, if the unfolding, less discounts, is smaller than
1039     this, then it's small enough inline
1040
1041ufKeenessFactor
1042     Factor by which the discounts are multiplied before
1043     subtracting from size
1044
1045ufDictDiscount
1046     The discount for each occurrence of a dictionary argument
1047     as an argument of a class method.  Should be pretty small
1048     else big functions may get inlined
1049
1050ufFunAppDiscount
1051     Discount for a function argument that is applied.  Quite
1052     large, because if we inline we avoid the higher-order call.
1053
1054ufDearOp
1055     The size of a foreign call or not-dupable PrimOp
1056
1057ufVeryAggressive
1058     If True, the compiler ignores all the thresholds and inlines very
1059     aggressively. It still adheres to arity, simplifier phase control and
1060     loop breakers.
1061
1062
1063Note [Function applications]
1064~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1065In a function application (f a b)
1066
1067  - If 'f' is an argument to the function being analysed,
1068    and there's at least one value arg, record a FunAppDiscount for f
1069
1070  - If the application if a PAP (arity > 2 in this example)
1071    record a *result* discount (because inlining
1072    with "extra" args in the call may mean that we now
1073    get a saturated application)
1074
1075Code for manipulating sizes
1076-}
1077
1078-- | The size of a candidate expression for unfolding
1079data ExprSize
1080    = TooBig
1081    | SizeIs { _es_size_is  :: {-# UNPACK #-} !Int -- ^ Size found
1082             , _es_args     :: !(Bag (Id,Int))
1083               -- ^ Arguments cased herein, and discount for each such
1084             , _es_discount :: {-# UNPACK #-} !Int
1085               -- ^ Size to subtract if result is scrutinised by a case
1086               -- expression
1087             }
1088
1089instance Outputable ExprSize where
1090  ppr TooBig         = text "TooBig"
1091  ppr (SizeIs a _ c) = brackets (int a <+> int c)
1092
1093-- subtract the discount before deciding whether to bale out. eg. we
1094-- want to inline a large constructor application into a selector:
1095--      tup = (a_1, ..., a_99)
1096--      x = case tup of ...
1097--
1098mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize
1099mkSizeIs max n xs d | (n - d) > max = TooBig
1100                    | otherwise     = SizeIs n xs d
1101
1102maxSize :: ExprSize -> ExprSize -> ExprSize
1103maxSize TooBig         _                                  = TooBig
1104maxSize _              TooBig                             = TooBig
1105maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2   = s1
1106                                              | otherwise = s2
1107
1108sizeZero :: ExprSize
1109sizeN :: Int -> ExprSize
1110
1111sizeZero = SizeIs 0 emptyBag 0
1112sizeN n  = SizeIs n emptyBag 0
1113
1114{-
1115************************************************************************
1116*                                                                      *
1117\subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
1118*                                                                      *
1119************************************************************************
1120
1121We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
1122we ``couldn't possibly use'' on the other side.  Can be overridden w/
1123flaggery.  Just the same as smallEnoughToInline, except that it has no
1124actual arguments.
1125-}
1126
1127couldBeSmallEnoughToInline :: DynFlags -> Int -> CoreExpr -> Bool
1128couldBeSmallEnoughToInline dflags threshold rhs
1129  = case sizeExpr dflags threshold [] body of
1130       TooBig -> False
1131       _      -> True
1132  where
1133    (_, body) = collectBinders rhs
1134
1135----------------
1136smallEnoughToInline :: DynFlags -> Unfolding -> Bool
1137smallEnoughToInline dflags (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}})
1138  = size <= ufUseThreshold dflags
1139smallEnoughToInline _ _
1140  = False
1141
1142----------------
1143
1144certainlyWillInline :: DynFlags -> IdInfo -> Maybe Unfolding
1145-- ^ Sees if the unfolding is pretty certain to inline.
1146-- If so, return a *stable* unfolding for it, that will always inline.
1147certainlyWillInline dflags fn_info
1148  = case unfoldingInfo fn_info of
1149      CoreUnfolding { uf_tmpl = e, uf_guidance = g }
1150        | loop_breaker -> Nothing      -- Won't inline, so try w/w
1151        | noinline     -> Nothing      -- See Note [Worker-wrapper for NOINLINE functions]
1152        | otherwise    -> do_cunf e g  -- Depends on size, so look at that
1153
1154      DFunUnfolding {} -> Just fn_unf  -- Don't w/w DFuns; it never makes sense
1155                                       -- to do so, and even if it is currently a
1156                                       -- loop breaker, it may not be later
1157
1158      _other_unf       -> Nothing
1159
1160  where
1161    loop_breaker = isStrongLoopBreaker (occInfo fn_info)
1162    noinline     = inlinePragmaSpec (inlinePragInfo fn_info) == NoInline
1163    fn_unf       = unfoldingInfo fn_info
1164
1165    do_cunf :: CoreExpr -> UnfoldingGuidance -> Maybe Unfolding
1166    do_cunf _ UnfNever     = Nothing
1167    do_cunf _ (UnfWhen {}) = Just (fn_unf { uf_src = InlineStable })
1168                             -- INLINE functions have UnfWhen
1169
1170        -- The UnfIfGoodArgs case seems important.  If we w/w small functions
1171        -- binary sizes go up by 10%!  (This is with SplitObjs.)
1172        -- I'm not totally sure why.
1173        -- INLINABLE functions come via this path
1174        --    See Note [certainlyWillInline: INLINABLE]
1175    do_cunf expr (UnfIfGoodArgs { ug_size = size, ug_args = args })
1176      | arityInfo fn_info > 0  -- See Note [certainlyWillInline: be careful of thunks]
1177      , not (isBottomingSig (strictnessInfo fn_info))
1178              -- Do not unconditionally inline a bottoming functions even if
1179              -- it seems smallish. We've carefully lifted it out to top level,
1180              -- so we don't want to re-inline it.
1181      , let unf_arity = length args
1182      , size - (10 * (unf_arity + 1)) <= ufUseThreshold dflags
1183      = Just (fn_unf { uf_src      = InlineStable
1184                     , uf_guidance = UnfWhen { ug_arity     = unf_arity
1185                                             , ug_unsat_ok  = unSaturatedOk
1186                                             , ug_boring_ok = inlineBoringOk expr } })
1187             -- Note the "unsaturatedOk". A function like  f = \ab. a
1188             -- will certainly inline, even if partially applied (f e), so we'd
1189             -- better make sure that the transformed inlining has the same property
1190      | otherwise
1191      = Nothing
1192
1193{- Note [certainlyWillInline: be careful of thunks]
1194~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1195Don't claim that thunks will certainly inline, because that risks work
1196duplication.  Even if the work duplication is not great (eg is_cheap
1197holds), it can make a big difference in an inner loop In #5623 we
1198found that the WorkWrap phase thought that
1199       y = case x of F# v -> F# (v +# v)
1200was certainlyWillInline, so the addition got duplicated.
1201
1202Note that we check arityInfo instead of the arity of the unfolding to detect
1203this case. This is so that we don't accidentally fail to inline small partial
1204applications, like `f = g 42` (where `g` recurses into `f`) where g has arity 2
1205(say). Here there is no risk of work duplication, and the RHS is tiny, so
1206certainlyWillInline should return True. But `unf_arity` is zero! However f's
1207arity, gotten from `arityInfo fn_info`, is 1.
1208
1209Failing to say that `f` will inline forces W/W to generate a potentially huge
1210worker for f that will immediately cancel with `g`'s wrapper anyway, causing
1211unnecessary churn in the Simplifier while arriving at the same result.
1212
1213Note [certainlyWillInline: INLINABLE]
1214~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1215certainlyWillInline /must/ return Nothing for a large INLINABLE thing,
1216even though we have a stable inlining, so that strictness w/w takes
1217place.  It makes a big difference to efficiency, and the w/w pass knows
1218how to transfer the INLINABLE info to the worker; see WorkWrap
1219Note [Worker-wrapper for INLINABLE functions]
1220
1221************************************************************************
1222*                                                                      *
1223\subsection{callSiteInline}
1224*                                                                      *
1225************************************************************************
1226
1227This is the key function.  It decides whether to inline a variable at a call site
1228
1229callSiteInline is used at call sites, so it is a bit more generous.
1230It's a very important function that embodies lots of heuristics.
1231A non-WHNF can be inlined if it doesn't occur inside a lambda,
1232and occurs exactly once or
1233    occurs once in each branch of a case and is small
1234
1235If the thing is in WHNF, there's no danger of duplicating work,
1236so we can inline if it occurs once, or is small
1237
1238NOTE: we don't want to inline top-level functions that always diverge.
1239It just makes the code bigger.  Tt turns out that the convenient way to prevent
1240them inlining is to give them a NOINLINE pragma, which we do in
1241StrictAnal.addStrictnessInfoToTopId
1242-}
1243
1244callSiteInline :: DynFlags
1245               -> Id                    -- The Id
1246               -> Bool                  -- True <=> unfolding is active
1247               -> Bool                  -- True if there are no arguments at all (incl type args)
1248               -> [ArgSummary]          -- One for each value arg; True if it is interesting
1249               -> CallCtxt              -- True <=> continuation is interesting
1250               -> Maybe CoreExpr        -- Unfolding, if any
1251
1252data ArgSummary = TrivArg       -- Nothing interesting
1253                | NonTrivArg    -- Arg has structure
1254                | ValueArg      -- Arg is a con-app or PAP
1255                                -- ..or con-like. Note [Conlike is interesting]
1256
1257instance Outputable ArgSummary where
1258  ppr TrivArg    = text "TrivArg"
1259  ppr NonTrivArg = text "NonTrivArg"
1260  ppr ValueArg   = text "ValueArg"
1261
1262nonTriv ::  ArgSummary -> Bool
1263nonTriv TrivArg = False
1264nonTriv _       = True
1265
1266data CallCtxt
1267  = BoringCtxt
1268  | RhsCtxt             -- Rhs of a let-binding; see Note [RHS of lets]
1269  | DiscArgCtxt         -- Argument of a function with non-zero arg discount
1270  | RuleArgCtxt         -- We are somewhere in the argument of a function with rules
1271
1272  | ValAppCtxt          -- We're applied to at least one value arg
1273                        -- This arises when we have ((f x |> co) y)
1274                        -- Then the (f x) has argument 'x' but in a ValAppCtxt
1275
1276  | CaseCtxt            -- We're the scrutinee of a case
1277                        -- that decomposes its scrutinee
1278
1279instance Outputable CallCtxt where
1280  ppr CaseCtxt    = text "CaseCtxt"
1281  ppr ValAppCtxt  = text "ValAppCtxt"
1282  ppr BoringCtxt  = text "BoringCtxt"
1283  ppr RhsCtxt     = text "RhsCtxt"
1284  ppr DiscArgCtxt = text "DiscArgCtxt"
1285  ppr RuleArgCtxt = text "RuleArgCtxt"
1286
1287callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info
1288  = case idUnfolding id of
1289      -- idUnfolding checks for loop-breakers, returning NoUnfolding
1290      -- Things with an INLINE pragma may have an unfolding *and*
1291      -- be a loop breaker  (maybe the knot is not yet untied)
1292        CoreUnfolding { uf_tmpl = unf_template
1293                      , uf_is_work_free = is_wf
1294                      , uf_guidance = guidance, uf_expandable = is_exp }
1295          | active_unfolding -> tryUnfolding dflags id lone_variable
1296                                    arg_infos cont_info unf_template
1297                                    is_wf is_exp guidance
1298          | otherwise -> traceInline dflags id "Inactive unfolding:" (ppr id) Nothing
1299        NoUnfolding      -> Nothing
1300        BootUnfolding    -> Nothing
1301        OtherCon {}      -> Nothing
1302        DFunUnfolding {} -> Nothing     -- Never unfold a DFun
1303
1304traceInline :: DynFlags -> Id -> String -> SDoc -> a -> a
1305traceInline dflags inline_id str doc result
1306 | Just prefix <- inlineCheck dflags
1307 =  if prefix `isPrefixOf` occNameString (getOccName inline_id)
1308      then pprTrace str doc result
1309      else result
1310 | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags
1311 = pprTrace str doc result
1312 | otherwise
1313 = result
1314
1315-- | This is an awful but temporary workaround for #17615, where the
1316-- case analysis from the 'ufVeryAggressive' selector causes the entire
1317-- 'DynFlags' to be unpacked into local bindings (due to binder swap). This
1318-- results in a tremendous amount of stack spillage, severely bloating the code
1319-- generated for 'callSiteInline'.
1320--
1321-- The right solution here is likely to fix binder swap to avoid this terrible
1322-- behavior (since there are likely other instances of this as well) but this
1323-- case was serious enough that it showed up in a CPU profile and consequently
1324-- I wanted to fix it for 8.10.
1325very_aggressive :: DynFlags -> Bool
1326very_aggressive = ufVeryAggressive
1327{-# NOINLINE very_aggressive #-}
1328
1329tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt
1330             -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance
1331             -> Maybe CoreExpr
1332tryUnfolding dflags id lone_variable
1333             arg_infos cont_info unf_template
1334             is_wf is_exp guidance
1335 = case guidance of
1336     UnfNever -> traceInline dflags id str (text "UnfNever") Nothing
1337
1338     UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok }
1339        | enough_args && (boring_ok || some_benefit || very_aggressive dflags)
1340                -- See Note [INLINE for small functions (3)]
1341        -> traceInline dflags id str (mk_doc some_benefit empty True) (Just unf_template)
1342        | otherwise
1343        -> traceInline dflags id str (mk_doc some_benefit empty False) Nothing
1344        where
1345          some_benefit = calc_some_benefit uf_arity
1346          enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0)
1347
1348     UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
1349        | very_aggressive dflags
1350        -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1351        | is_wf && some_benefit && small_enough
1352        -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template)
1353        | otherwise
1354        -> traceInline dflags id str (mk_doc some_benefit extra_doc False) Nothing
1355        where
1356          some_benefit = calc_some_benefit (length arg_discounts)
1357          extra_doc = text "discounted size =" <+> int discounted_size
1358          discounted_size = size - discount
1359          small_enough = discounted_size <= ufUseThreshold dflags
1360          discount = computeDiscount dflags arg_discounts
1361                                     res_discount arg_infos cont_info
1362
1363  where
1364    mk_doc some_benefit extra_doc yes_or_no
1365      = vcat [ text "arg infos" <+> ppr arg_infos
1366             , text "interesting continuation" <+> ppr cont_info
1367             , text "some_benefit" <+> ppr some_benefit
1368             , text "is exp:" <+> ppr is_exp
1369             , text "is work-free:" <+> ppr is_wf
1370             , text "guidance" <+> ppr guidance
1371             , extra_doc
1372             , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"]
1373
1374    str = "Considering inlining: " ++ showSDocDump dflags (ppr id)
1375    n_val_args = length arg_infos
1376
1377           -- some_benefit is used when the RHS is small enough
1378           -- and the call has enough (or too many) value
1379           -- arguments (ie n_val_args >= arity). But there must
1380           -- be *something* interesting about some argument, or the
1381           -- result context, to make it worth inlining
1382    calc_some_benefit :: Arity -> Bool   -- The Arity is the number of args
1383                                         -- expected by the unfolding
1384    calc_some_benefit uf_arity
1385       | not saturated = interesting_args       -- Under-saturated
1386                                        -- Note [Unsaturated applications]
1387       | otherwise = interesting_args   -- Saturated or over-saturated
1388                  || interesting_call
1389      where
1390        saturated      = n_val_args >= uf_arity
1391        over_saturated = n_val_args > uf_arity
1392        interesting_args = any nonTriv arg_infos
1393                -- NB: (any nonTriv arg_infos) looks at the
1394                -- over-saturated args too which is "wrong";
1395                -- but if over-saturated we inline anyway.
1396
1397        interesting_call
1398          | over_saturated
1399          = True
1400          | otherwise
1401          = case cont_info of
1402              CaseCtxt   -> not (lone_variable && is_exp)  -- Note [Lone variables]
1403              ValAppCtxt -> True                           -- Note [Cast then apply]
1404              RuleArgCtxt -> uf_arity > 0  -- See Note [Unfold info lazy contexts]
1405              DiscArgCtxt -> uf_arity > 0  -- Note [Inlining in ArgCtxt]
1406              RhsCtxt     -> uf_arity > 0  --
1407              _other      -> False         -- See Note [Nested functions]
1408
1409
1410{-
1411Note [Unfold into lazy contexts], Note [RHS of lets]
1412~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1413When the call is the argument of a function with a RULE, or the RHS of a let,
1414we are a little bit keener to inline.  For example
1415     f y = (y,y,y)
1416     g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
1417We'd inline 'f' if the call was in a case context, and it kind-of-is,
1418only we can't see it.  Also
1419     x = f v
1420could be expensive whereas
1421     x = case v of (a,b) -> a
1422is patently cheap and may allow more eta expansion.
1423So we treat the RHS of a let as not-totally-boring.
1424
1425Note [Unsaturated applications]
1426~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1427When a call is not saturated, we *still* inline if one of the
1428arguments has interesting structure.  That's sometimes very important.
1429A good example is the Ord instance for Bool in Base:
1430
1431 Rec {
1432    $fOrdBool =GHC.Classes.D:Ord
1433                 @ Bool
1434                 ...
1435                 $cmin_ajX
1436
1437    $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
1438    $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
1439  }
1440
1441But the defn of GHC.Classes.$dmmin is:
1442
1443  $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
1444    {- Arity: 3, HasNoCafRefs, Strictness: SLL,
1445       Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a ->
1446                   case @ a GHC.Classes.<= @ a $dOrd x y of wild {
1447                     GHC.Types.False -> y GHC.Types.True -> x }) -}
1448
1449We *really* want to inline $dmmin, even though it has arity 3, in
1450order to unravel the recursion.
1451
1452
1453Note [Things to watch]
1454~~~~~~~~~~~~~~~~~~~~~~
1455*   { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
1456    Assume x is exported, so not inlined unconditionally.
1457    Then we want x to inline unconditionally; no reason for it
1458    not to, and doing so avoids an indirection.
1459
1460*   { x = I# 3; ....f x.... }
1461    Make sure that x does not inline unconditionally!
1462    Lest we get extra allocation.
1463
1464Note [Inlining an InlineRule]
1465~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1466An InlineRules is used for
1467  (a) programmer INLINE pragmas
1468  (b) inlinings from worker/wrapper
1469
1470For (a) the RHS may be large, and our contract is that we *only* inline
1471when the function is applied to all the arguments on the LHS of the
1472source-code defn.  (The uf_arity in the rule.)
1473
1474However for worker/wrapper it may be worth inlining even if the
1475arity is not satisfied (as we do in the CoreUnfolding case) so we don't
1476require saturation.
1477
1478Note [Nested functions]
1479~~~~~~~~~~~~~~~~~~~~~~~
1480At one time we treated a call of a non-top-level function as
1481"interesting" (regardless of how boring the context) in the hope
1482that inlining it would eliminate the binding, and its allocation.
1483Specifically, in the default case of interesting_call we had
1484   _other -> not is_top && uf_arity > 0
1485
1486But actually postInlineUnconditionally does some of this and overall
1487it makes virtually no difference to nofib.  So I simplified away this
1488special case
1489
1490Note [Cast then apply]
1491~~~~~~~~~~~~~~~~~~~~~~
1492Consider
1493   myIndex = __inline_me ( (/\a. <blah>) |> co )
1494   co :: (forall a. a -> a) ~ (forall a. T a)
1495     ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
1496
1497We need to inline myIndex to unravel this; but the actual call (myIndex a) has
1498no value arguments.  The ValAppCtxt gives it enough incentive to inline.
1499
1500Note [Inlining in ArgCtxt]
1501~~~~~~~~~~~~~~~~~~~~~~~~~~
1502The condition (arity > 0) here is very important, because otherwise
1503we end up inlining top-level stuff into useless places; eg
1504   x = I# 3#
1505   f = \y.  g x
1506This can make a very big difference: it adds 16% to nofib 'integer' allocs,
1507and 20% to 'power'.
1508
1509At one stage I replaced this condition by 'True' (leading to the above
1510slow-down).  The motivation was test eyeball/inline1.hs; but that seems
1511to work ok now.
1512
1513NOTE: arguably, we should inline in ArgCtxt only if the result of the
1514call is at least CONLIKE.  At least for the cases where we use ArgCtxt
1515for the RHS of a 'let', we only profit from the inlining if we get a
1516CONLIKE thing (modulo lets).
1517
1518Note [Lone variables]   See also Note [Interaction of exprIsWorkFree and lone variables]
1519~~~~~~~~~~~~~~~~~~~~~   which appears below
1520The "lone-variable" case is important.  I spent ages messing about
1521with unsatisfactory variants, but this is nice.  The idea is that if a
1522variable appears all alone
1523
1524        as an arg of lazy fn, or rhs    BoringCtxt
1525        as scrutinee of a case          CaseCtxt
1526        as arg of a fn                  ArgCtxt
1527AND
1528        it is bound to a cheap expression
1529
1530then we should not inline it (unless there is some other reason,
1531e.g. it is the sole occurrence).  That is what is happening at
1532the use of 'lone_variable' in 'interesting_call'.
1533
1534Why?  At least in the case-scrutinee situation, turning
1535        let x = (a,b) in case x of y -> ...
1536into
1537        let x = (a,b) in case (a,b) of y -> ...
1538and thence to
1539        let x = (a,b) in let y = (a,b) in ...
1540is bad if the binding for x will remain.
1541
1542Another example: I discovered that strings
1543were getting inlined straight back into applications of 'error'
1544because the latter is strict.
1545        s = "foo"
1546        f = \x -> ...(error s)...
1547
1548Fundamentally such contexts should not encourage inlining because, provided
1549the RHS is "expandable" (see Note [exprIsExpandable] in CoreUtils) the
1550context can ``see'' the unfolding of the variable (e.g. case or a
1551RULE) so there's no gain.
1552
1553However, watch out:
1554
1555 * Consider this:
1556        foo = _inline_ (\n. [n])
1557        bar = _inline_ (foo 20)
1558        baz = \n. case bar of { (m:_) -> m + n }
1559   Here we really want to inline 'bar' so that we can inline 'foo'
1560   and the whole thing unravels as it should obviously do.  This is
1561   important: in the NDP project, 'bar' generates a closure data
1562   structure rather than a list.
1563
1564   So the non-inlining of lone_variables should only apply if the
1565   unfolding is regarded as cheap; because that is when exprIsConApp_maybe
1566   looks through the unfolding.  Hence the "&& is_wf" in the
1567   InlineRule branch.
1568
1569 * Even a type application or coercion isn't a lone variable.
1570   Consider
1571        case $fMonadST @ RealWorld of { :DMonad a b c -> c }
1572   We had better inline that sucker!  The case won't see through it.
1573
1574   For now, I'm treating treating a variable applied to types
1575   in a *lazy* context "lone". The motivating example was
1576        f = /\a. \x. BIG
1577        g = /\a. \y.  h (f a)
1578   There's no advantage in inlining f here, and perhaps
1579   a significant disadvantage.  Hence some_val_args in the Stop case
1580
1581Note [Interaction of exprIsWorkFree and lone variables]
1582~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1583The lone-variable test says "don't inline if a case expression
1584scrutinises a lone variable whose unfolding is cheap".  It's very
1585important that, under these circumstances, exprIsConApp_maybe
1586can spot a constructor application. So, for example, we don't
1587consider
1588        let x = e in (x,x)
1589to be cheap, and that's good because exprIsConApp_maybe doesn't
1590think that expression is a constructor application.
1591
1592In the 'not (lone_variable && is_wf)' test, I used to test is_value
1593rather than is_wf, which was utterly wrong, because the above
1594expression responds True to exprIsHNF, which is what sets is_value.
1595
1596This kind of thing can occur if you have
1597
1598        {-# INLINE foo #-}
1599        foo = let x = e in (x,x)
1600
1601which Roman did.
1602
1603
1604-}
1605
1606computeDiscount :: DynFlags -> [Int] -> Int -> [ArgSummary] -> CallCtxt
1607                -> Int
1608computeDiscount dflags arg_discounts res_discount arg_infos cont_info
1609        -- We multiple the raw discounts (args_discount and result_discount)
1610        -- ty opt_UnfoldingKeenessFactor because the former have to do with
1611        --  *size* whereas the discounts imply that there's some extra
1612        --  *efficiency* to be gained (e.g. beta reductions, case reductions)
1613        -- by inlining.
1614
1615  = 10          -- Discount of 10 because the result replaces the call
1616                -- so we count 10 for the function itself
1617
1618    + 10 * length actual_arg_discounts
1619               -- Discount of 10 for each arg supplied,
1620               -- because the result replaces the call
1621
1622    + round (ufKeenessFactor dflags *
1623             fromIntegral (total_arg_discount + res_discount'))
1624  where
1625    actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos
1626    total_arg_discount   = sum actual_arg_discounts
1627
1628    mk_arg_discount _        TrivArg    = 0
1629    mk_arg_discount _        NonTrivArg = 10
1630    mk_arg_discount discount ValueArg   = discount
1631
1632    res_discount'
1633      | LT <- arg_discounts `compareLength` arg_infos
1634      = res_discount   -- Over-saturated
1635      | otherwise
1636      = case cont_info of
1637           BoringCtxt  -> 0
1638           CaseCtxt    -> res_discount  -- Presumably a constructor
1639           ValAppCtxt  -> res_discount  -- Presumably a function
1640           _           -> 40 `min` res_discount
1641                -- ToDo: this 40 `min` res_discount doesn't seem right
1642                --   for DiscArgCtxt it shouldn't matter because the function will
1643                --       get the arg discount for any non-triv arg
1644                --   for RuleArgCtxt we do want to be keener to inline; but not only
1645                --       constructor results
1646                --   for RhsCtxt I suppose that exposing a data con is good in general
1647                --   And 40 seems very arbitrary
1648                --
1649                -- res_discount can be very large when a function returns
1650                -- constructors; but we only want to invoke that large discount
1651                -- when there's a case continuation.
1652                -- Otherwise we, rather arbitrarily, threshold it.  Yuk.
1653                -- But we want to aovid inlining large functions that return
1654                -- constructors into contexts that are simply "interesting"
1655