1{- 2(c) The University of Glasgow 2006 3(c) The AQUA Project, Glasgow University, 1994-1998 4 5 6Core-syntax unfoldings 7 8Unfoldings (which can travel across module boundaries) are in Core 9syntax (namely @CoreExpr@s). 10 11The type @Unfolding@ sits ``above'' simply-Core-expressions 12unfoldings, capturing ``higher-level'' things we know about a binding, 13usually things that the simplifier found out (e.g., ``it's a 14literal''). In the corner of a @CoreUnfolding@ unfolding, you will 15find, unsurprisingly, a Core expression. 16-} 17 18{-# LANGUAGE CPP #-} 19 20{-# OPTIONS_GHC -Wno-incomplete-record-updates #-} 21 22module GHC.Core.Unfold ( 23 Unfolding, UnfoldingGuidance, -- Abstract types 24 25 noUnfolding, 26 mkUnfolding, mkCoreUnfolding, 27 mkFinalUnfolding, mkSimpleUnfolding, mkWorkerUnfolding, 28 mkInlineUnfolding, mkInlineUnfoldingWithArity, 29 mkInlinableUnfolding, mkWwInlineRule, 30 mkCompulsoryUnfolding, mkDFunUnfolding, 31 specUnfolding, 32 33 ArgSummary(..), 34 35 couldBeSmallEnoughToInline, inlineBoringOk, 36 certainlyWillInline, smallEnoughToInline, 37 38 callSiteInline, CallCtxt(..), 39 40 -- Reexport from GHC.Core.Subst (it only live there so it can be used 41 -- by the Very Simple Optimiser) 42 exprIsConApp_maybe, exprIsLiteral_maybe 43 ) where 44 45#include "GhclibHsVersions.h" 46 47import GHC.Prelude 48 49import GHC.Driver.Session 50import GHC.Core 51import GHC.Core.Opt.OccurAnal ( occurAnalyseExpr ) 52import GHC.Core.SimpleOpt 53import GHC.Core.Opt.Arity ( manifestArity ) 54import GHC.Core.Utils 55import GHC.Types.Id 56import GHC.Types.Demand ( StrictSig, isDeadEndSig ) 57import GHC.Core.DataCon 58import GHC.Types.Literal 59import GHC.Builtin.PrimOps 60import GHC.Types.Id.Info 61import GHC.Types.Basic ( Arity, InlineSpec(..), inlinePragmaSpec ) 62import GHC.Core.Type 63import GHC.Builtin.Names 64import GHC.Builtin.Types.Prim ( realWorldStatePrimTy ) 65import GHC.Data.Bag 66import GHC.Utils.Misc 67import GHC.Utils.Outputable 68import GHC.Types.ForeignCall 69import GHC.Types.Name 70import GHC.Utils.Error 71 72import qualified Data.ByteString as BS 73import Data.List 74 75{- 76************************************************************************ 77* * 78\subsection{Making unfoldings} 79* * 80************************************************************************ 81-} 82 83mkFinalUnfolding :: DynFlags -> UnfoldingSource -> StrictSig -> CoreExpr -> Unfolding 84-- "Final" in the sense that this is a GlobalId that will not be further 85-- simplified; so the unfolding should be occurrence-analysed 86mkFinalUnfolding dflags src strict_sig expr 87 = mkUnfolding dflags src 88 True {- Top level -} 89 (isDeadEndSig strict_sig) 90 expr 91 92mkCompulsoryUnfolding :: CoreExpr -> Unfolding 93mkCompulsoryUnfolding expr -- Used for things that absolutely must be unfolded 94 = mkCoreUnfolding InlineCompulsory True 95 (simpleOptExpr unsafeGlobalDynFlags expr) 96 (UnfWhen { ug_arity = 0 -- Arity of unfolding doesn't matter 97 , ug_unsat_ok = unSaturatedOk, ug_boring_ok = boringCxtOk }) 98 99 100-- Note [Top-level flag on inline rules] 101-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 102-- Slight hack: note that mk_inline_rules conservatively sets the 103-- top-level flag to True. It gets set more accurately by the simplifier 104-- Simplify.simplUnfolding. 105 106mkSimpleUnfolding :: DynFlags -> CoreExpr -> Unfolding 107mkSimpleUnfolding dflags rhs 108 = mkUnfolding dflags InlineRhs False False rhs 109 110mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding 111mkDFunUnfolding bndrs con ops 112 = DFunUnfolding { df_bndrs = bndrs 113 , df_con = con 114 , df_args = map occurAnalyseExpr ops } 115 -- See Note [Occurrence analysis of unfoldings] 116 117mkWwInlineRule :: DynFlags -> CoreExpr -> Arity -> Unfolding 118mkWwInlineRule dflags expr arity 119 = mkCoreUnfolding InlineStable True 120 (simpleOptExpr dflags expr) 121 (UnfWhen { ug_arity = arity, ug_unsat_ok = unSaturatedOk 122 , ug_boring_ok = boringCxtNotOk }) 123 124mkWorkerUnfolding :: DynFlags -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding 125-- See Note [Worker-wrapper for INLINABLE functions] in GHC.Core.Opt.WorkWrap 126mkWorkerUnfolding dflags work_fn 127 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl 128 , uf_is_top = top_lvl }) 129 | isStableSource src 130 = mkCoreUnfolding src top_lvl new_tmpl guidance 131 where 132 new_tmpl = simpleOptExpr dflags (work_fn tmpl) 133 guidance = calcUnfoldingGuidance dflags False new_tmpl 134 135mkWorkerUnfolding _ _ _ = noUnfolding 136 137-- | Make an unfolding that may be used unsaturated 138-- (ug_unsat_ok = unSaturatedOk) and that is reported as having its 139-- manifest arity (the number of outer lambdas applications will 140-- resolve before doing any work). 141mkInlineUnfolding :: CoreExpr -> Unfolding 142mkInlineUnfolding expr 143 = mkCoreUnfolding InlineStable 144 True -- Note [Top-level flag on inline rules] 145 expr' guide 146 where 147 expr' = simpleOptExpr unsafeGlobalDynFlags expr 148 guide = UnfWhen { ug_arity = manifestArity expr' 149 , ug_unsat_ok = unSaturatedOk 150 , ug_boring_ok = boring_ok } 151 boring_ok = inlineBoringOk expr' 152 153-- | Make an unfolding that will be used once the RHS has been saturated 154-- to the given arity. 155mkInlineUnfoldingWithArity :: Arity -> CoreExpr -> Unfolding 156mkInlineUnfoldingWithArity arity expr 157 = mkCoreUnfolding InlineStable 158 True -- Note [Top-level flag on inline rules] 159 expr' guide 160 where 161 expr' = simpleOptExpr unsafeGlobalDynFlags expr 162 guide = UnfWhen { ug_arity = arity 163 , ug_unsat_ok = needSaturated 164 , ug_boring_ok = boring_ok } 165 -- See Note [INLINE pragmas and boring contexts] as to why we need to look 166 -- at the arity here. 167 boring_ok | arity == 0 = True 168 | otherwise = inlineBoringOk expr' 169 170mkInlinableUnfolding :: DynFlags -> CoreExpr -> Unfolding 171mkInlinableUnfolding dflags expr 172 = mkUnfolding dflags InlineStable False False expr' 173 where 174 expr' = simpleOptExpr dflags expr 175 176specUnfolding :: DynFlags 177 -> [Var] -> (CoreExpr -> CoreExpr) 178 -> [CoreArg] -- LHS arguments in the RULE 179 -> Unfolding -> Unfolding 180-- See Note [Specialising unfoldings] 181-- specUnfolding spec_bndrs spec_args unf 182-- = \spec_bndrs. unf spec_args 183-- 184specUnfolding dflags spec_bndrs spec_app rule_lhs_args 185 df@(DFunUnfolding { df_bndrs = old_bndrs, df_con = con, df_args = args }) 186 = ASSERT2( rule_lhs_args `equalLength` old_bndrs 187 , ppr df $$ ppr rule_lhs_args ) 188 -- For this ASSERT see Note [DFunUnfoldings] in GHC.Core.Opt.Specialise 189 mkDFunUnfolding spec_bndrs con (map spec_arg args) 190 -- For DFunUnfoldings we transform 191 -- \obs. MkD <op1> ... <opn> 192 -- to 193 -- \sbs. MkD ((\obs. <op1>) spec_args) ... ditto <opn> 194 where 195 spec_arg arg = simpleOptExpr dflags $ 196 spec_app (mkLams old_bndrs arg) 197 -- The beta-redexes created by spec_app will be 198 -- simplified away by simplOptExpr 199 200specUnfolding dflags spec_bndrs spec_app rule_lhs_args 201 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl 202 , uf_is_top = top_lvl 203 , uf_guidance = old_guidance }) 204 | isStableSource src -- See Note [Specialising unfoldings] 205 , UnfWhen { ug_arity = old_arity } <- old_guidance 206 = mkCoreUnfolding src top_lvl new_tmpl 207 (old_guidance { ug_arity = old_arity - arity_decrease }) 208 where 209 new_tmpl = simpleOptExpr dflags $ 210 mkLams spec_bndrs $ 211 spec_app tmpl -- The beta-redexes created by spec_app 212 -- will besimplified away by simplOptExpr 213 arity_decrease = count isValArg rule_lhs_args - count isId spec_bndrs 214 215 216specUnfolding _ _ _ _ _ = noUnfolding 217 218{- Note [Specialising unfoldings] 219~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 220When we specialise a function for some given type-class arguments, we use 221specUnfolding to specialise its unfolding. Some important points: 222 223* If the original function has a DFunUnfolding, the specialised one 224 must do so too! Otherwise we lose the magic rules that make it 225 interact with ClassOps 226 227* There is a bit of hack for INLINABLE functions: 228 f :: Ord a => .... 229 f = <big-rhs> 230 {- INLINABLE f #-} 231 Now if we specialise f, should the specialised version still have 232 an INLINABLE pragma? If it does, we'll capture a specialised copy 233 of <big-rhs> as its unfolding, and that probably won't inline. But 234 if we don't, the specialised version of <big-rhs> might be small 235 enough to inline at a call site. This happens with Control.Monad.liftM3, 236 and can cause a lot more allocation as a result (nofib n-body shows this). 237 238 Moreover, keeping the INLINABLE thing isn't much help, because 239 the specialised function (probably) isn't overloaded any more. 240 241 Conclusion: drop the INLINEALE pragma. In practice what this means is: 242 if a stable unfolding has UnfoldingGuidance of UnfWhen, 243 we keep it (so the specialised thing too will always inline) 244 if a stable unfolding has UnfoldingGuidance of UnfIfGoodArgs 245 (which arises from INLINABLE), we discard it 246 247Note [Honour INLINE on 0-ary bindings] 248~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 249Consider 250 251 x = <expensive> 252 {-# INLINE x #-} 253 254 f y = ...x... 255 256The semantics of an INLINE pragma is 257 258 inline x at every call site, provided it is saturated; 259 that is, applied to at least as many arguments as appear 260 on the LHS of the Haskell source definition. 261 262(This source-code-derived arity is stored in the `ug_arity` field of 263the `UnfoldingGuidance`.) 264 265In the example, x's ug_arity is 0, so we should inline it at every use 266site. It's rare to have such an INLINE pragma (usually INLINE Is on 267functions), but it's occasionally very important (#15578, #15519). 268In #15519 we had something like 269 x = case (g a b) of I# r -> T r 270 {-# INLINE x #-} 271 f y = ...(h x).... 272 273where h is strict. So we got 274 f y = ...(case g a b of I# r -> h (T r))... 275 276and that in turn allowed SpecConstr to ramp up performance. 277 278How do we deliver on this? By adjusting the ug_boring_ok 279flag in mkInlineUnfoldingWithArity; see 280Note [INLINE pragmas and boring contexts] 281 282NB: there is a real risk that full laziness will float it right back 283out again. Consider again 284 x = factorial 200 285 {-# INLINE x #-} 286 f y = ...x... 287 288After inlining we get 289 f y = ...(factorial 200)... 290 291but it's entirely possible that full laziness will do 292 lvl23 = factorial 200 293 f y = ...lvl23... 294 295That's a problem for another day. 296 297Note [INLINE pragmas and boring contexts] 298~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 299An INLINE pragma uses mkInlineUnfoldingWithArity to build the 300unfolding. That sets the ug_boring_ok flag to False if the function 301is not tiny (inlineBoringOK), so that even INLINE functions are not 302inlined in an utterly boring context. E.g. 303 \x y. Just (f y x) 304Nothing is gained by inlining f here, even if it has an INLINE 305pragma. 306 307But for 0-ary bindings, we want to inline regardless; see 308Note [Honour INLINE on 0-ary bindings]. 309 310I'm a bit worried that it's possible for the same kind of problem 311to arise for non-0-ary functions too, but let's wait and see. 312-} 313 314mkUnfolding :: DynFlags -> UnfoldingSource 315 -> Bool -- Is top-level 316 -> Bool -- Definitely a bottoming binding 317 -- (only relevant for top-level bindings) 318 -> CoreExpr 319 -> Unfolding 320-- Calculates unfolding guidance 321-- Occurrence-analyses the expression before capturing it 322mkUnfolding dflags src top_lvl is_bottoming expr 323 = mkCoreUnfolding src top_lvl expr guidance 324 where 325 is_top_bottoming = top_lvl && is_bottoming 326 guidance = calcUnfoldingGuidance dflags is_top_bottoming expr 327 -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr expr))! 328 -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression] 329 330mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr 331 -> UnfoldingGuidance -> Unfolding 332-- Occurrence-analyses the expression before capturing it 333mkCoreUnfolding src top_lvl expr guidance 334 = CoreUnfolding { uf_tmpl = occurAnalyseExpr expr, 335 -- See Note [Occurrence analysis of unfoldings] 336 uf_src = src, 337 uf_is_top = top_lvl, 338 uf_is_value = exprIsHNF expr, 339 uf_is_conlike = exprIsConLike expr, 340 uf_is_work_free = exprIsWorkFree expr, 341 uf_expandable = exprIsExpandable expr, 342 uf_guidance = guidance } 343 344 345{- 346Note [Occurrence analysis of unfoldings] 347~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 348We do occurrence-analysis of unfoldings once and for all, when the 349unfolding is built, rather than each time we inline them. 350 351But given this decision it's vital that we do 352*always* do it. Consider this unfolding 353 \x -> letrec { f = ...g...; g* = f } in body 354where g* is (for some strange reason) the loop breaker. If we don't 355occ-anal it when reading it in, we won't mark g as a loop breaker, and 356we may inline g entirely in body, dropping its binding, and leaving 357the occurrence in f out of scope. This happened in #8892, where 358the unfolding in question was a DFun unfolding. 359 360But more generally, the simplifier is designed on the 361basis that it is looking at occurrence-analysed expressions, so better 362ensure that they actually are. 363 364Note [Calculate unfolding guidance on the non-occ-anal'd expression] 365~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 366Notice that we give the non-occur-analysed expression to 367calcUnfoldingGuidance. In some ways it'd be better to occur-analyse 368first; for example, sometimes during simplification, there's a large 369let-bound thing which has been substituted, and so is now dead; so 370'expr' contains two copies of the thing while the occurrence-analysed 371expression doesn't. 372 373Nevertheless, we *don't* and *must not* occ-analyse before computing 374the size because 375 376a) The size computation bales out after a while, whereas occurrence 377 analysis does not. 378 379b) Residency increases sharply if you occ-anal first. I'm not 380 100% sure why, but it's a large effect. Compiling Cabal went 381 from residency of 534M to over 800M with this one change. 382 383This can occasionally mean that the guidance is very pessimistic; 384it gets fixed up next round. And it should be rare, because large 385let-bound things that are dead are usually caught by preInlineUnconditionally 386 387 388************************************************************************ 389* * 390\subsection{The UnfoldingGuidance type} 391* * 392************************************************************************ 393-} 394 395inlineBoringOk :: CoreExpr -> Bool 396-- See Note [INLINE for small functions] 397-- True => the result of inlining the expression is 398-- no bigger than the expression itself 399-- eg (\x y -> f y x) 400-- This is a quick and dirty version. It doesn't attempt 401-- to deal with (\x y z -> x (y z)) 402-- The really important one is (x `cast` c) 403inlineBoringOk e 404 = go 0 e 405 where 406 go :: Int -> CoreExpr -> Bool 407 go credit (Lam x e) | isId x = go (credit+1) e 408 | otherwise = go credit e 409 -- See Note [Count coercion arguments in boring contexts] 410 go credit (App f (Type {})) = go credit f 411 go credit (App f a) | credit > 0 412 , exprIsTrivial a = go (credit-1) f 413 go credit (Tick _ e) = go credit e -- dubious 414 go credit (Cast e _) = go credit e 415 go credit (Case scrut _ _ [(_,_,rhs)]) -- See Note [Inline unsafeCoerce] 416 | isUnsafeEqualityProof scrut = go credit rhs 417 go _ (Var {}) = boringCxtOk 418 go _ _ = boringCxtNotOk 419 420calcUnfoldingGuidance 421 :: DynFlags 422 -> Bool -- Definitely a top-level, bottoming binding 423 -> CoreExpr -- Expression to look at 424 -> UnfoldingGuidance 425calcUnfoldingGuidance dflags is_top_bottoming (Tick t expr) 426 | not (tickishIsCode t) -- non-code ticks don't matter for unfolding 427 = calcUnfoldingGuidance dflags is_top_bottoming expr 428calcUnfoldingGuidance dflags is_top_bottoming expr 429 = case sizeExpr dflags bOMB_OUT_SIZE val_bndrs body of 430 TooBig -> UnfNever 431 SizeIs size cased_bndrs scrut_discount 432 | uncondInline expr n_val_bndrs size 433 -> UnfWhen { ug_unsat_ok = unSaturatedOk 434 , ug_boring_ok = boringCxtOk 435 , ug_arity = n_val_bndrs } -- Note [INLINE for small functions] 436 437 | is_top_bottoming 438 -> UnfNever -- See Note [Do not inline top-level bottoming functions] 439 440 | otherwise 441 -> UnfIfGoodArgs { ug_args = map (mk_discount cased_bndrs) val_bndrs 442 , ug_size = size 443 , ug_res = scrut_discount } 444 445 where 446 (bndrs, body) = collectBinders expr 447 bOMB_OUT_SIZE = ufCreationThreshold dflags 448 -- Bomb out if size gets bigger than this 449 val_bndrs = filter isId bndrs 450 n_val_bndrs = length val_bndrs 451 452 mk_discount :: Bag (Id,Int) -> Id -> Int 453 mk_discount cbs bndr = foldl' combine 0 cbs 454 where 455 combine acc (bndr', disc) 456 | bndr == bndr' = acc `plus_disc` disc 457 | otherwise = acc 458 459 plus_disc :: Int -> Int -> Int 460 plus_disc | isFunTy (idType bndr) = max 461 | otherwise = (+) 462 -- See Note [Function and non-function discounts] 463 464{- Note [Inline unsafeCoerce] 465~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 466We really want to inline unsafeCoerce, even when applied to boring 467arguments. It doesn't look as if its RHS is smaller than the call 468 unsafeCoerce x = case unsafeEqualityProof @a @b of UnsafeRefl -> x 469but that case is discarded -- see Note [Implementing unsafeCoerce] 470in base:Unsafe.Coerce. 471 472Moreover, if we /don't/ inline it, we may be left with 473 f (unsafeCoerce x) 474which will build a thunk -- bad, bad, bad. 475 476Conclusion: we really want inlineBoringOk to be True of the RHS of 477unsafeCoerce. This is (U4a) in Note [Implementing unsafeCoerce]. 478 479Note [Computing the size of an expression] 480~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 481The basic idea of sizeExpr is obvious enough: count nodes. But getting the 482heuristics right has taken a long time. Here's the basic strategy: 483 484 * Variables, literals: 0 485 (Exception for string literals, see litSize.) 486 487 * Function applications (f e1 .. en): 1 + #value args 488 489 * Constructor applications: 1, regardless of #args 490 491 * Let(rec): 1 + size of components 492 493 * Note, cast: 0 494 495Examples 496 497 Size Term 498 -------------- 499 0 42# 500 0 x 501 0 True 502 2 f x 503 1 Just x 504 4 f (g x) 505 506Notice that 'x' counts 0, while (f x) counts 2. That's deliberate: there's 507a function call to account for. Notice also that constructor applications 508are very cheap, because exposing them to a caller is so valuable. 509 510[25/5/11] All sizes are now multiplied by 10, except for primops 511(which have sizes like 1 or 4. This makes primops look fantastically 512cheap, and seems to be almost universally beneficial. Done partly as a 513result of #4978. 514 515Note [Do not inline top-level bottoming functions] 516~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 517The FloatOut pass has gone to some trouble to float out calls to 'error' 518and similar friends. See Note [Bottoming floats] in GHC.Core.Opt.SetLevels. 519Do not re-inline them! But we *do* still inline if they are very small 520(the uncondInline stuff). 521 522Note [INLINE for small functions] 523~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 524Consider {-# INLINE f #-} 525 f x = Just x 526 g y = f y 527Then f's RHS is no larger than its LHS, so we should inline it into 528even the most boring context. In general, f the function is 529sufficiently small that its body is as small as the call itself, the 530inline unconditionally, regardless of how boring the context is. 531 532Things to note: 533 534(1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr) 535 than the thing it's replacing. Notice that 536 (f x) --> (g 3) -- YES, unconditionally 537 (f x) --> x : [] -- YES, *even though* there are two 538 -- arguments to the cons 539 x --> g 3 -- NO 540 x --> Just v -- NO 541 542 It's very important not to unconditionally replace a variable by 543 a non-atomic term. 544 545(2) We do this even if the thing isn't saturated, else we end up with the 546 silly situation that 547 f x y = x 548 ...map (f 3)... 549 doesn't inline. Even in a boring context, inlining without being 550 saturated will give a lambda instead of a PAP, and will be more 551 efficient at runtime. 552 553(3) However, when the function's arity > 0, we do insist that it 554 has at least one value argument at the call site. (This check is 555 made in the UnfWhen case of callSiteInline.) Otherwise we find this: 556 f = /\a \x:a. x 557 d = /\b. MkD (f b) 558 If we inline f here we get 559 d = /\b. MkD (\x:b. x) 560 and then prepareRhs floats out the argument, abstracting the type 561 variables, so we end up with the original again! 562 563(4) We must be much more cautious about arity-zero things. Consider 564 let x = y +# z in ... 565 In *size* terms primops look very small, because the generate a 566 single instruction, but we do not want to unconditionally replace 567 every occurrence of x with (y +# z). So we only do the 568 unconditional-inline thing for *trivial* expressions. 569 570 NB: you might think that PostInlineUnconditionally would do this 571 but it doesn't fire for top-level things; see GHC.Core.Opt.Simplify.Utils 572 Note [Top level and postInlineUnconditionally] 573 574Note [Count coercion arguments in boring contexts] 575~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 576In inlineBoringOK, we ignore type arguments when deciding whether an 577expression is okay to inline into boring contexts. This is good, since 578if we have a definition like 579 580 let y = x @Int in f y y 581 582there’s no reason not to inline y at both use sites — no work is 583actually duplicated. It may seem like the same reasoning applies to 584coercion arguments, and indeed, in #17182 we changed inlineBoringOK to 585treat coercions the same way. 586 587However, this isn’t a good idea: unlike type arguments, which have 588no runtime representation, coercion arguments *do* have a runtime 589representation (albeit the zero-width VoidRep, see Note [Coercion tokens] 590in "GHC.CoreToStg"). This caused trouble in #17787 for DataCon wrappers for 591nullary GADT constructors: the wrappers would be inlined and each use of 592the constructor would lead to a separate allocation instead of just 593sharing the wrapper closure. 594 595The solution: don’t ignore coercion arguments after all. 596-} 597 598uncondInline :: CoreExpr -> Arity -> Int -> Bool 599-- Inline unconditionally if there no size increase 600-- Size of call is arity (+1 for the function) 601-- See Note [INLINE for small functions] 602uncondInline rhs arity size 603 | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1) 604 | otherwise = exprIsTrivial rhs -- See Note [INLINE for small functions] (4) 605 606sizeExpr :: DynFlags 607 -> Int -- Bomb out if it gets bigger than this 608 -> [Id] -- Arguments; we're interested in which of these 609 -- get case'd 610 -> CoreExpr 611 -> ExprSize 612 613-- Note [Computing the size of an expression] 614 615sizeExpr dflags bOMB_OUT_SIZE top_args expr 616 = size_up expr 617 where 618 size_up (Cast e _) = size_up e 619 size_up (Tick _ e) = size_up e 620 size_up (Type _) = sizeZero -- Types cost nothing 621 size_up (Coercion _) = sizeZero 622 size_up (Lit lit) = sizeN (litSize lit) 623 size_up (Var f) | isRealWorldId f = sizeZero 624 -- Make sure we get constructor discounts even 625 -- on nullary constructors 626 | otherwise = size_up_call f [] 0 627 628 size_up (App fun arg) 629 | isTyCoArg arg = size_up fun 630 | otherwise = size_up arg `addSizeNSD` 631 size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0) 632 633 size_up (Lam b e) 634 | isId b && not (isRealWorldId b) = lamScrutDiscount dflags (size_up e `addSizeN` 10) 635 | otherwise = size_up e 636 637 size_up (Let (NonRec binder rhs) body) 638 = size_up_rhs (binder, rhs) `addSizeNSD` 639 size_up body `addSizeN` 640 size_up_alloc binder 641 642 size_up (Let (Rec pairs) body) 643 = foldr (addSizeNSD . size_up_rhs) 644 (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs)) 645 pairs 646 647 size_up (Case e _ _ alts) 648 | null alts 649 = size_up e -- case e of {} never returns, so take size of scrutinee 650 651 size_up (Case e _ _ alts) 652 -- Now alts is non-empty 653 | Just v <- is_top_arg e -- We are scrutinising an argument variable 654 = let 655 alt_sizes = map size_up_alt alts 656 657 -- alts_size tries to compute a good discount for 658 -- the case when we are scrutinising an argument variable 659 alts_size (SizeIs tot tot_disc tot_scrut) 660 -- Size of all alternatives 661 (SizeIs max _ _) 662 -- Size of biggest alternative 663 = SizeIs tot (unitBag (v, 20 + tot - max) 664 `unionBags` tot_disc) tot_scrut 665 -- If the variable is known, we produce a 666 -- discount that will take us back to 'max', 667 -- the size of the largest alternative The 668 -- 1+ is a little discount for reduced 669 -- allocation in the caller 670 -- 671 -- Notice though, that we return tot_disc, 672 -- the total discount from all branches. I 673 -- think that's right. 674 675 alts_size tot_size _ = tot_size 676 in 677 alts_size (foldr1 addAltSize alt_sizes) -- alts is non-empty 678 (foldr1 maxSize alt_sizes) 679 -- Good to inline if an arg is scrutinised, because 680 -- that may eliminate allocation in the caller 681 -- And it eliminates the case itself 682 where 683 is_top_arg (Var v) | v `elem` top_args = Just v 684 is_top_arg (Cast e _) = is_top_arg e 685 is_top_arg _ = Nothing 686 687 688 size_up (Case e _ _ alts) = size_up e `addSizeNSD` 689 foldr (addAltSize . size_up_alt) case_size alts 690 where 691 case_size 692 | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10) 693 | otherwise = sizeZero 694 -- Normally we don't charge for the case itself, but 695 -- we charge one per alternative (see size_up_alt, 696 -- below) to account for the cost of the info table 697 -- and comparisons. 698 -- 699 -- However, in certain cases (see is_inline_scrut 700 -- below), no code is generated for the case unless 701 -- there are multiple alts. In these cases we 702 -- subtract one, making the first alt free. 703 -- e.g. case x# +# y# of _ -> ... should cost 1 704 -- case touch# x# of _ -> ... should cost 0 705 -- (see #4978) 706 -- 707 -- I would like to not have the "lengthAtMost alts 1" 708 -- condition above, but without that some programs got worse 709 -- (spectral/hartel/event and spectral/para). I don't fully 710 -- understand why. (SDM 24/5/11) 711 712 -- unboxed variables, inline primops and unsafe foreign calls 713 -- are all "inline" things: 714 is_inline_scrut (Var v) = isUnliftedType (idType v) 715 is_inline_scrut scrut 716 | (Var f, _) <- collectArgs scrut 717 = case idDetails f of 718 FCallId fc -> not (isSafeForeignCall fc) 719 PrimOpId op -> not (primOpOutOfLine op) 720 _other -> False 721 | otherwise 722 = False 723 724 size_up_rhs (bndr, rhs) 725 | Just join_arity <- isJoinId_maybe bndr 726 -- Skip arguments to join point 727 , (_bndrs, body) <- collectNBinders join_arity rhs 728 = size_up body 729 | otherwise 730 = size_up rhs 731 732 ------------ 733 -- size_up_app is used when there's ONE OR MORE value args 734 size_up_app (App fun arg) args voids 735 | isTyCoArg arg = size_up_app fun args voids 736 | isRealWorldExpr arg = size_up_app fun (arg:args) (voids + 1) 737 | otherwise = size_up arg `addSizeNSD` 738 size_up_app fun (arg:args) voids 739 size_up_app (Var fun) args voids = size_up_call fun args voids 740 size_up_app (Tick _ expr) args voids = size_up_app expr args voids 741 size_up_app (Cast expr _) args voids = size_up_app expr args voids 742 size_up_app other args voids = size_up other `addSizeN` 743 callSize (length args) voids 744 -- if the lhs is not an App or a Var, or an invisible thing like a 745 -- Tick or Cast, then we should charge for a complete call plus the 746 -- size of the lhs itself. 747 748 ------------ 749 size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize 750 size_up_call fun val_args voids 751 = case idDetails fun of 752 FCallId _ -> sizeN (callSize (length val_args) voids) 753 DataConWorkId dc -> conSize dc (length val_args) 754 PrimOpId op -> primOpSize op (length val_args) 755 ClassOpId _ -> classOpSize dflags top_args val_args 756 _ -> funSize dflags top_args fun (length val_args) voids 757 758 ------------ 759 size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10 760 -- Don't charge for args, so that wrappers look cheap 761 -- (See comments about wrappers with Case) 762 -- 763 -- IMPORTANT: *do* charge 1 for the alternative, else we 764 -- find that giant case nests are treated as practically free 765 -- A good example is Foreign.C.Error.errnoToIOError 766 767 ------------ 768 -- Cost to allocate binding with given binder 769 size_up_alloc bndr 770 | isTyVar bndr -- Doesn't exist at runtime 771 || isJoinId bndr -- Not allocated at all 772 || isUnliftedType (idType bndr) -- Doesn't live in heap 773 = 0 774 | otherwise 775 = 10 776 777 ------------ 778 -- These addSize things have to be here because 779 -- I don't want to give them bOMB_OUT_SIZE as an argument 780 addSizeN TooBig _ = TooBig 781 addSizeN (SizeIs n xs d) m = mkSizeIs bOMB_OUT_SIZE (n + m) xs d 782 783 -- addAltSize is used to add the sizes of case alternatives 784 addAltSize TooBig _ = TooBig 785 addAltSize _ TooBig = TooBig 786 addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2) 787 = mkSizeIs bOMB_OUT_SIZE (n1 + n2) 788 (xs `unionBags` ys) 789 (d1 + d2) -- Note [addAltSize result discounts] 790 791 -- This variant ignores the result discount from its LEFT argument 792 -- It's used when the second argument isn't part of the result 793 addSizeNSD TooBig _ = TooBig 794 addSizeNSD _ TooBig = TooBig 795 addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2) 796 = mkSizeIs bOMB_OUT_SIZE (n1 + n2) 797 (xs `unionBags` ys) 798 d2 -- Ignore d1 799 800 isRealWorldId id = idType id `eqType` realWorldStatePrimTy 801 802 -- an expression of type State# RealWorld must be a variable 803 isRealWorldExpr (Var id) = isRealWorldId id 804 isRealWorldExpr (Tick _ e) = isRealWorldExpr e 805 isRealWorldExpr _ = False 806 807-- | Finds a nominal size of a string literal. 808litSize :: Literal -> Int 809-- Used by GHC.Core.Unfold.sizeExpr 810litSize (LitNumber LitNumInteger _) = 100 -- Note [Size of literal integers] 811litSize (LitNumber LitNumNatural _) = 100 812litSize (LitString str) = 10 + 10 * ((BS.length str + 3) `div` 4) 813 -- If size could be 0 then @f "x"@ might be too small 814 -- [Sept03: make literal strings a bit bigger to avoid fruitless 815 -- duplication of little strings] 816litSize _other = 0 -- Must match size of nullary constructors 817 -- Key point: if x |-> 4, then x must inline unconditionally 818 -- (eg via case binding) 819 820classOpSize :: DynFlags -> [Id] -> [CoreExpr] -> ExprSize 821-- See Note [Conlike is interesting] 822classOpSize _ _ [] 823 = sizeZero 824classOpSize dflags top_args (arg1 : other_args) 825 = SizeIs size arg_discount 0 826 where 827 size = 20 + (10 * length other_args) 828 -- If the class op is scrutinising a lambda bound dictionary then 829 -- give it a discount, to encourage the inlining of this function 830 -- The actual discount is rather arbitrarily chosen 831 arg_discount = case arg1 of 832 Var dict | dict `elem` top_args 833 -> unitBag (dict, ufDictDiscount dflags) 834 _other -> emptyBag 835 836-- | The size of a function call 837callSize 838 :: Int -- ^ number of value args 839 -> Int -- ^ number of value args that are void 840 -> Int 841callSize n_val_args voids = 10 * (1 + n_val_args - voids) 842 -- The 1+ is for the function itself 843 -- Add 1 for each non-trivial arg; 844 -- the allocation cost, as in let(rec) 845 846-- | The size of a jump to a join point 847jumpSize 848 :: Int -- ^ number of value args 849 -> Int -- ^ number of value args that are void 850 -> Int 851jumpSize n_val_args voids = 2 * (1 + n_val_args - voids) 852 -- A jump is 20% the size of a function call. Making jumps free reopens 853 -- bug #6048, but making them any more expensive loses a 21% improvement in 854 -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a 855 -- better solution? 856 857funSize :: DynFlags -> [Id] -> Id -> Int -> Int -> ExprSize 858-- Size for functions that are not constructors or primops 859-- Note [Function applications] 860funSize dflags top_args fun n_val_args voids 861 | fun `hasKey` buildIdKey = buildSize 862 | fun `hasKey` augmentIdKey = augmentSize 863 | otherwise = SizeIs size arg_discount res_discount 864 where 865 some_val_args = n_val_args > 0 866 is_join = isJoinId fun 867 868 size | is_join = jumpSize n_val_args voids 869 | not some_val_args = 0 870 | otherwise = callSize n_val_args voids 871 872 -- DISCOUNTS 873 -- See Note [Function and non-function discounts] 874 arg_discount | some_val_args && fun `elem` top_args 875 = unitBag (fun, ufFunAppDiscount dflags) 876 | otherwise = emptyBag 877 -- If the function is an argument and is applied 878 -- to some values, give it an arg-discount 879 880 res_discount | idArity fun > n_val_args = ufFunAppDiscount dflags 881 | otherwise = 0 882 -- If the function is partially applied, show a result discount 883-- XXX maybe behave like ConSize for eval'd variable 884 885conSize :: DataCon -> Int -> ExprSize 886conSize dc n_val_args 887 | n_val_args == 0 = SizeIs 0 emptyBag 10 -- Like variables 888 889-- See Note [Unboxed tuple size and result discount] 890 | isUnboxedTupleCon dc = SizeIs 0 emptyBag 10 891 892-- See Note [Constructor size and result discount] 893 | otherwise = SizeIs 10 emptyBag 10 894 895{- Note [Constructor size and result discount] 896~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 897Treat a constructors application as size 10, regardless of how many 898arguments it has; we are keen to expose them (and we charge separately 899for their args). We can't treat them as size zero, else we find that 900(Just x) has size 0, which is the same as a lone variable; and hence 901'v' will always be replaced by (Just x), where v is bound to Just x. 902 903The "result discount" is applied if the result of the call is 904scrutinised (say by a case). For a constructor application that will 905mean the constructor application will disappear, so we don't need to 906charge it to the function. So the discount should at least match the 907cost of the constructor application, namely 10. 908 909Historical note 1: Until Jun 2020 we gave it a "bit of extra 910incentive" via a discount of 10*(1 + n_val_args), but that was FAR too 911much (#18282). In particular, consider a huge case tree like 912 913 let r = case y1 of 914 Nothing -> B1 a b c 915 Just v1 -> case y2 of 916 Nothing -> B1 c b a 917 Just v2 -> ... 918 919If conSize gives a cost of 10 (regardless of n_val_args) and a 920discount of 10, that'll make each alternative RHS cost zero. We 921charge 10 for each case alternative (see size_up_alt). If we give a 922bigger discount (say 20) in conSize, we'll make the case expression 923cost *nothing*, and that can make a huge case tree cost nothing. This 924leads to massive, sometimes exponenial inlinings (#18282). In short, 925don't give a discount that give a negative size to a sub-expression! 926 927Historical note 2: Much longer ago, Simon M tried a MUCH bigger 928discount: (10 * (10 + n_val_args)), and said it was an "unambiguous 929win", but its terribly dangerous because a function with many many 930case branches, each finishing with a constructor, can have an 931arbitrarily large discount. This led to terrible code bloat: see 932#6099. 933 934Note [Unboxed tuple size and result discount] 935~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 936However, unboxed tuples count as size zero. I found occasions where we had 937 f x y z = case op# x y z of { s -> (# s, () #) } 938and f wasn't getting inlined. 939 940I tried giving unboxed tuples a *result discount* of zero (see the 941commented-out line). Why? When returned as a result they do not 942allocate, so maybe we don't want to charge so much for them. If you 943have a non-zero discount here, we find that workers often get inlined 944back into wrappers, because it look like 945 f x = case $wf x of (# a,b #) -> (a,b) 946and we are keener because of the case. However while this change 947shrank binary sizes by 0.5% it also made spectral/boyer allocate 5% 948more. All other changes were very small. So it's not a big deal but I 949didn't adopt the idea. 950 951When fixing #18282 (see Note [Constructor size and result discount]) 952I changed the result discount to be just 10, not 10*(1+n_val_args). 953 954Note [Function and non-function discounts] 955~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 956We want a discount if the function is applied. A good example is 957monadic combinators with continuation arguments, where inlining is 958quite important. 959 960But we don't want a big discount when a function is called many times 961(see the detailed comments with #6048) because if the function is 962big it won't be inlined at its many call sites and no benefit results. 963Indeed, we can get exponentially big inlinings this way; that is what 964#6048 is about. 965 966On the other hand, for data-valued arguments, if there are lots of 967case expressions in the body, each one will get smaller if we apply 968the function to a constructor application, so we *want* a big discount 969if the argument is scrutinised by many case expressions. 970 971Conclusion: 972 - For functions, take the max of the discounts 973 - For data values, take the sum of the discounts 974 975 976Note [Literal integer size] 977~~~~~~~~~~~~~~~~~~~~~~~~~~~ 978Literal integers *can* be big (mkInteger [...coefficients...]), but 979need not be (IS n). We just use an arbitrary big-ish constant here 980so that, in particular, we don't inline top-level defns like 981 n = IS 5 982There's no point in doing so -- any optimisations will see the IS 983through n's unfolding. Nor will a big size inhibit unfoldings functions 984that mention a literal Integer, because the float-out pass will float 985all those constants to top level. 986-} 987 988primOpSize :: PrimOp -> Int -> ExprSize 989primOpSize op n_val_args 990 = if primOpOutOfLine op 991 then sizeN (op_size + n_val_args) 992 else sizeN op_size 993 where 994 op_size = primOpCodeSize op 995 996 997buildSize :: ExprSize 998buildSize = SizeIs 0 emptyBag 40 999 -- We really want to inline applications of build 1000 -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later) 1001 -- Indeed, we should add a result_discount because build is 1002 -- very like a constructor. We don't bother to check that the 1003 -- build is saturated (it usually is). The "-2" discounts for the \c n, 1004 -- The "4" is rather arbitrary. 1005 1006augmentSize :: ExprSize 1007augmentSize = SizeIs 0 emptyBag 40 1008 -- Ditto (augment t (\cn -> e) ys) should cost only the cost of 1009 -- e plus ys. The -2 accounts for the \cn 1010 1011-- When we return a lambda, give a discount if it's used (applied) 1012lamScrutDiscount :: DynFlags -> ExprSize -> ExprSize 1013lamScrutDiscount dflags (SizeIs n vs _) = SizeIs n vs (ufFunAppDiscount dflags) 1014lamScrutDiscount _ TooBig = TooBig 1015 1016{- 1017Note [addAltSize result discounts] 1018~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1019When adding the size of alternatives, we *add* the result discounts 1020too, rather than take the *maximum*. For a multi-branch case, this 1021gives a discount for each branch that returns a constructor, making us 1022keener to inline. I did try using 'max' instead, but it makes nofib 1023'rewrite' and 'puzzle' allocate significantly more, and didn't make 1024binary sizes shrink significantly either. 1025 1026Note [Discounts and thresholds] 1027~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1028Constants for discounts and thesholds are defined in "GHC.Driver.Session", 1029all of form ufXxxx. They are: 1030 1031ufCreationThreshold 1032 At a definition site, if the unfolding is bigger than this, we 1033 may discard it altogether 1034 1035ufUseThreshold 1036 At a call site, if the unfolding, less discounts, is smaller than 1037 this, then it's small enough inline 1038 1039ufDictDiscount 1040 The discount for each occurrence of a dictionary argument 1041 as an argument of a class method. Should be pretty small 1042 else big functions may get inlined 1043 1044ufFunAppDiscount 1045 Discount for a function argument that is applied. Quite 1046 large, because if we inline we avoid the higher-order call. 1047 1048ufDearOp 1049 The size of a foreign call or not-dupable PrimOp 1050 1051ufVeryAggressive 1052 If True, the compiler ignores all the thresholds and inlines very 1053 aggressively. It still adheres to arity, simplifier phase control and 1054 loop breakers. 1055 1056 1057Historical Note: Before April 2020 we had another factor, 1058ufKeenessFactor, which would scale the discounts before they were subtracted 1059from the size. This was justified with the following comment: 1060 1061 -- We multiply the raw discounts (args_discount and result_discount) 1062 -- ty opt_UnfoldingKeenessFactor because the former have to do with 1063 -- *size* whereas the discounts imply that there's some extra 1064 -- *efficiency* to be gained (e.g. beta reductions, case reductions) 1065 -- by inlining. 1066 1067However, this is highly suspect since it means that we subtract a *scaled* size 1068from an absolute size, resulting in crazy (e.g. negative) scores in some cases 1069(#15304). We consequently killed off ufKeenessFactor and bumped up the 1070ufUseThreshold to compensate. 1071 1072 1073Note [Function applications] 1074~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1075In a function application (f a b) 1076 1077 - If 'f' is an argument to the function being analysed, 1078 and there's at least one value arg, record a FunAppDiscount for f 1079 1080 - If the application if a PAP (arity > 2 in this example) 1081 record a *result* discount (because inlining 1082 with "extra" args in the call may mean that we now 1083 get a saturated application) 1084 1085Code for manipulating sizes 1086-} 1087 1088-- | The size of a candidate expression for unfolding 1089data ExprSize 1090 = TooBig 1091 | SizeIs { _es_size_is :: {-# UNPACK #-} !Int -- ^ Size found 1092 , _es_args :: !(Bag (Id,Int)) 1093 -- ^ Arguments cased herein, and discount for each such 1094 , _es_discount :: {-# UNPACK #-} !Int 1095 -- ^ Size to subtract if result is scrutinised by a case 1096 -- expression 1097 } 1098 1099instance Outputable ExprSize where 1100 ppr TooBig = text "TooBig" 1101 ppr (SizeIs a _ c) = brackets (int a <+> int c) 1102 1103-- subtract the discount before deciding whether to bale out. eg. we 1104-- want to inline a large constructor application into a selector: 1105-- tup = (a_1, ..., a_99) 1106-- x = case tup of ... 1107-- 1108mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize 1109mkSizeIs max n xs d | (n - d) > max = TooBig 1110 | otherwise = SizeIs n xs d 1111 1112maxSize :: ExprSize -> ExprSize -> ExprSize 1113maxSize TooBig _ = TooBig 1114maxSize _ TooBig = TooBig 1115maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2 = s1 1116 | otherwise = s2 1117 1118sizeZero :: ExprSize 1119sizeN :: Int -> ExprSize 1120 1121sizeZero = SizeIs 0 emptyBag 0 1122sizeN n = SizeIs n emptyBag 0 1123 1124{- 1125************************************************************************ 1126* * 1127\subsection[considerUnfolding]{Given all the info, do (not) do the unfolding} 1128* * 1129************************************************************************ 1130 1131We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that 1132we ``couldn't possibly use'' on the other side. Can be overridden w/ 1133flaggery. Just the same as smallEnoughToInline, except that it has no 1134actual arguments. 1135-} 1136 1137couldBeSmallEnoughToInline :: DynFlags -> Int -> CoreExpr -> Bool 1138couldBeSmallEnoughToInline dflags threshold rhs 1139 = case sizeExpr dflags threshold [] body of 1140 TooBig -> False 1141 _ -> True 1142 where 1143 (_, body) = collectBinders rhs 1144 1145---------------- 1146smallEnoughToInline :: DynFlags -> Unfolding -> Bool 1147smallEnoughToInline dflags (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}}) 1148 = size <= ufUseThreshold dflags 1149smallEnoughToInline _ _ 1150 = False 1151 1152---------------- 1153 1154certainlyWillInline :: DynFlags -> IdInfo -> Maybe Unfolding 1155-- ^ Sees if the unfolding is pretty certain to inline. 1156-- If so, return a *stable* unfolding for it, that will always inline. 1157certainlyWillInline dflags fn_info 1158 = case unfoldingInfo fn_info of 1159 CoreUnfolding { uf_tmpl = e, uf_guidance = g } 1160 | loop_breaker -> Nothing -- Won't inline, so try w/w 1161 | noinline -> Nothing -- See Note [Worker-wrapper for NOINLINE functions] 1162 | otherwise -> do_cunf e g -- Depends on size, so look at that 1163 1164 DFunUnfolding {} -> Just fn_unf -- Don't w/w DFuns; it never makes sense 1165 -- to do so, and even if it is currently a 1166 -- loop breaker, it may not be later 1167 1168 _other_unf -> Nothing 1169 1170 where 1171 loop_breaker = isStrongLoopBreaker (occInfo fn_info) 1172 noinline = inlinePragmaSpec (inlinePragInfo fn_info) == NoInline 1173 fn_unf = unfoldingInfo fn_info 1174 1175 do_cunf :: CoreExpr -> UnfoldingGuidance -> Maybe Unfolding 1176 do_cunf _ UnfNever = Nothing 1177 do_cunf _ (UnfWhen {}) = Just (fn_unf { uf_src = InlineStable }) 1178 -- INLINE functions have UnfWhen 1179 1180 -- The UnfIfGoodArgs case seems important. If we w/w small functions 1181 -- binary sizes go up by 10%! (This is with SplitObjs.) 1182 -- I'm not totally sure why. 1183 -- INLINABLE functions come via this path 1184 -- See Note [certainlyWillInline: INLINABLE] 1185 do_cunf expr (UnfIfGoodArgs { ug_size = size, ug_args = args }) 1186 | arityInfo fn_info > 0 -- See Note [certainlyWillInline: be careful of thunks] 1187 , not (isDeadEndSig (strictnessInfo fn_info)) 1188 -- Do not unconditionally inline a bottoming functions even if 1189 -- it seems smallish. We've carefully lifted it out to top level, 1190 -- so we don't want to re-inline it. 1191 , let unf_arity = length args 1192 , size - (10 * (unf_arity + 1)) <= ufUseThreshold dflags 1193 = Just (fn_unf { uf_src = InlineStable 1194 , uf_guidance = UnfWhen { ug_arity = unf_arity 1195 , ug_unsat_ok = unSaturatedOk 1196 , ug_boring_ok = inlineBoringOk expr } }) 1197 -- Note the "unsaturatedOk". A function like f = \ab. a 1198 -- will certainly inline, even if partially applied (f e), so we'd 1199 -- better make sure that the transformed inlining has the same property 1200 | otherwise 1201 = Nothing 1202 1203{- Note [certainlyWillInline: be careful of thunks] 1204~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1205Don't claim that thunks will certainly inline, because that risks work 1206duplication. Even if the work duplication is not great (eg is_cheap 1207holds), it can make a big difference in an inner loop In #5623 we 1208found that the WorkWrap phase thought that 1209 y = case x of F# v -> F# (v +# v) 1210was certainlyWillInline, so the addition got duplicated. 1211 1212Note that we check arityInfo instead of the arity of the unfolding to detect 1213this case. This is so that we don't accidentally fail to inline small partial 1214applications, like `f = g 42` (where `g` recurses into `f`) where g has arity 2 1215(say). Here there is no risk of work duplication, and the RHS is tiny, so 1216certainlyWillInline should return True. But `unf_arity` is zero! However f's 1217arity, gotten from `arityInfo fn_info`, is 1. 1218 1219Failing to say that `f` will inline forces W/W to generate a potentially huge 1220worker for f that will immediately cancel with `g`'s wrapper anyway, causing 1221unnecessary churn in the Simplifier while arriving at the same result. 1222 1223Note [certainlyWillInline: INLINABLE] 1224~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1225certainlyWillInline /must/ return Nothing for a large INLINABLE thing, 1226even though we have a stable inlining, so that strictness w/w takes 1227place. It makes a big difference to efficiency, and the w/w pass knows 1228how to transfer the INLINABLE info to the worker; see WorkWrap 1229Note [Worker-wrapper for INLINABLE functions] 1230 1231************************************************************************ 1232* * 1233\subsection{callSiteInline} 1234* * 1235************************************************************************ 1236 1237This is the key function. It decides whether to inline a variable at a call site 1238 1239callSiteInline is used at call sites, so it is a bit more generous. 1240It's a very important function that embodies lots of heuristics. 1241A non-WHNF can be inlined if it doesn't occur inside a lambda, 1242and occurs exactly once or 1243 occurs once in each branch of a case and is small 1244 1245If the thing is in WHNF, there's no danger of duplicating work, 1246so we can inline if it occurs once, or is small 1247 1248NOTE: we don't want to inline top-level functions that always diverge. 1249It just makes the code bigger. Tt turns out that the convenient way to prevent 1250them inlining is to give them a NOINLINE pragma, which we do in 1251StrictAnal.addStrictnessInfoToTopId 1252-} 1253 1254callSiteInline :: DynFlags 1255 -> Id -- The Id 1256 -> Bool -- True <=> unfolding is active 1257 -> Bool -- True if there are no arguments at all (incl type args) 1258 -> [ArgSummary] -- One for each value arg; True if it is interesting 1259 -> CallCtxt -- True <=> continuation is interesting 1260 -> Maybe CoreExpr -- Unfolding, if any 1261 1262data ArgSummary = TrivArg -- Nothing interesting 1263 | NonTrivArg -- Arg has structure 1264 | ValueArg -- Arg is a con-app or PAP 1265 -- ..or con-like. Note [Conlike is interesting] 1266 1267instance Outputable ArgSummary where 1268 ppr TrivArg = text "TrivArg" 1269 ppr NonTrivArg = text "NonTrivArg" 1270 ppr ValueArg = text "ValueArg" 1271 1272nonTriv :: ArgSummary -> Bool 1273nonTriv TrivArg = False 1274nonTriv _ = True 1275 1276data CallCtxt 1277 = BoringCtxt 1278 | RhsCtxt -- Rhs of a let-binding; see Note [RHS of lets] 1279 | DiscArgCtxt -- Argument of a function with non-zero arg discount 1280 | RuleArgCtxt -- We are somewhere in the argument of a function with rules 1281 1282 | ValAppCtxt -- We're applied to at least one value arg 1283 -- This arises when we have ((f x |> co) y) 1284 -- Then the (f x) has argument 'x' but in a ValAppCtxt 1285 1286 | CaseCtxt -- We're the scrutinee of a case 1287 -- that decomposes its scrutinee 1288 1289instance Outputable CallCtxt where 1290 ppr CaseCtxt = text "CaseCtxt" 1291 ppr ValAppCtxt = text "ValAppCtxt" 1292 ppr BoringCtxt = text "BoringCtxt" 1293 ppr RhsCtxt = text "RhsCtxt" 1294 ppr DiscArgCtxt = text "DiscArgCtxt" 1295 ppr RuleArgCtxt = text "RuleArgCtxt" 1296 1297callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info 1298 = case idUnfolding id of 1299 -- idUnfolding checks for loop-breakers, returning NoUnfolding 1300 -- Things with an INLINE pragma may have an unfolding *and* 1301 -- be a loop breaker (maybe the knot is not yet untied) 1302 CoreUnfolding { uf_tmpl = unf_template 1303 , uf_is_work_free = is_wf 1304 , uf_guidance = guidance, uf_expandable = is_exp } 1305 | active_unfolding -> tryUnfolding dflags id lone_variable 1306 arg_infos cont_info unf_template 1307 is_wf is_exp guidance 1308 | otherwise -> traceInline dflags id "Inactive unfolding:" (ppr id) Nothing 1309 NoUnfolding -> Nothing 1310 BootUnfolding -> Nothing 1311 OtherCon {} -> Nothing 1312 DFunUnfolding {} -> Nothing -- Never unfold a DFun 1313 1314-- | Report the inlining of an identifier's RHS to the user, if requested. 1315traceInline :: DynFlags -> Id -> String -> SDoc -> a -> a 1316traceInline dflags inline_id str doc result 1317 -- We take care to ensure that doc is used in only one branch, ensuring that 1318 -- the simplifier can push its allocation into the branch. See Note [INLINE 1319 -- conditional tracing utilities]. 1320 | enable = traceAction dflags str doc result 1321 | otherwise = result 1322 where 1323 enable 1324 | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags 1325 = True 1326 | Just prefix <- inlineCheck dflags 1327 = prefix `isPrefixOf` occNameString (getOccName inline_id) 1328 | otherwise 1329 = False 1330{-# INLINE traceInline #-} -- see Note [INLINE conditional tracing utilities] 1331 1332tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt 1333 -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance 1334 -> Maybe CoreExpr 1335tryUnfolding dflags id lone_variable 1336 arg_infos cont_info unf_template 1337 is_wf is_exp guidance 1338 = case guidance of 1339 UnfNever -> traceInline dflags id str (text "UnfNever") Nothing 1340 1341 UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok } 1342 | enough_args && (boring_ok || some_benefit || ufVeryAggressive dflags) 1343 -- See Note [INLINE for small functions (3)] 1344 -> traceInline dflags id str (mk_doc some_benefit empty True) (Just unf_template) 1345 | otherwise 1346 -> traceInline dflags id str (mk_doc some_benefit empty False) Nothing 1347 where 1348 some_benefit = calc_some_benefit uf_arity 1349 enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0) 1350 1351 UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size } 1352 | ufVeryAggressive dflags 1353 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template) 1354 | is_wf && some_benefit && small_enough 1355 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template) 1356 | otherwise 1357 -> traceInline dflags id str (mk_doc some_benefit extra_doc False) Nothing 1358 where 1359 some_benefit = calc_some_benefit (length arg_discounts) 1360 extra_doc = text "discounted size =" <+> int discounted_size 1361 discounted_size = size - discount 1362 small_enough = discounted_size <= ufUseThreshold dflags 1363 discount = computeDiscount arg_discounts res_discount arg_infos cont_info 1364 1365 where 1366 mk_doc some_benefit extra_doc yes_or_no 1367 = vcat [ text "arg infos" <+> ppr arg_infos 1368 , text "interesting continuation" <+> ppr cont_info 1369 , text "some_benefit" <+> ppr some_benefit 1370 , text "is exp:" <+> ppr is_exp 1371 , text "is work-free:" <+> ppr is_wf 1372 , text "guidance" <+> ppr guidance 1373 , extra_doc 1374 , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"] 1375 1376 str = "Considering inlining: " ++ showSDocDump dflags (ppr id) 1377 n_val_args = length arg_infos 1378 1379 -- some_benefit is used when the RHS is small enough 1380 -- and the call has enough (or too many) value 1381 -- arguments (ie n_val_args >= arity). But there must 1382 -- be *something* interesting about some argument, or the 1383 -- result context, to make it worth inlining 1384 calc_some_benefit :: Arity -> Bool -- The Arity is the number of args 1385 -- expected by the unfolding 1386 calc_some_benefit uf_arity 1387 | not saturated = interesting_args -- Under-saturated 1388 -- Note [Unsaturated applications] 1389 | otherwise = interesting_args -- Saturated or over-saturated 1390 || interesting_call 1391 where 1392 saturated = n_val_args >= uf_arity 1393 over_saturated = n_val_args > uf_arity 1394 interesting_args = any nonTriv arg_infos 1395 -- NB: (any nonTriv arg_infos) looks at the 1396 -- over-saturated args too which is "wrong"; 1397 -- but if over-saturated we inline anyway. 1398 1399 interesting_call 1400 | over_saturated 1401 = True 1402 | otherwise 1403 = case cont_info of 1404 CaseCtxt -> not (lone_variable && is_exp) -- Note [Lone variables] 1405 ValAppCtxt -> True -- Note [Cast then apply] 1406 RuleArgCtxt -> uf_arity > 0 -- See Note [Unfold info lazy contexts] 1407 DiscArgCtxt -> uf_arity > 0 -- Note [Inlining in ArgCtxt] 1408 RhsCtxt -> uf_arity > 0 -- 1409 _other -> False -- See Note [Nested functions] 1410 1411 1412{- 1413Note [Unfold into lazy contexts], Note [RHS of lets] 1414~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1415When the call is the argument of a function with a RULE, or the RHS of a let, 1416we are a little bit keener to inline. For example 1417 f y = (y,y,y) 1418 g y = let x = f y in ...(case x of (a,b,c) -> ...) ... 1419We'd inline 'f' if the call was in a case context, and it kind-of-is, 1420only we can't see it. Also 1421 x = f v 1422could be expensive whereas 1423 x = case v of (a,b) -> a 1424is patently cheap and may allow more eta expansion. 1425So we treat the RHS of a let as not-totally-boring. 1426 1427Note [Unsaturated applications] 1428~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1429When a call is not saturated, we *still* inline if one of the 1430arguments has interesting structure. That's sometimes very important. 1431A good example is the Ord instance for Bool in Base: 1432 1433 Rec { 1434 $fOrdBool =GHC.Classes.D:Ord 1435 @ Bool 1436 ... 1437 $cmin_ajX 1438 1439 $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool 1440 $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool 1441 } 1442 1443But the defn of GHC.Classes.$dmmin is: 1444 1445 $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a 1446 {- Arity: 3, HasNoCafRefs, Strictness: SLL, 1447 Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a -> 1448 case @ a GHC.Classes.<= @ a $dOrd x y of wild { 1449 GHC.Types.False -> y GHC.Types.True -> x }) -} 1450 1451We *really* want to inline $dmmin, even though it has arity 3, in 1452order to unravel the recursion. 1453 1454 1455Note [Things to watch] 1456~~~~~~~~~~~~~~~~~~~~~~ 1457* { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... } 1458 Assume x is exported, so not inlined unconditionally. 1459 Then we want x to inline unconditionally; no reason for it 1460 not to, and doing so avoids an indirection. 1461 1462* { x = I# 3; ....f x.... } 1463 Make sure that x does not inline unconditionally! 1464 Lest we get extra allocation. 1465 1466Note [Inlining an InlineRule] 1467~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1468An InlineRules is used for 1469 (a) programmer INLINE pragmas 1470 (b) inlinings from worker/wrapper 1471 1472For (a) the RHS may be large, and our contract is that we *only* inline 1473when the function is applied to all the arguments on the LHS of the 1474source-code defn. (The uf_arity in the rule.) 1475 1476However for worker/wrapper it may be worth inlining even if the 1477arity is not satisfied (as we do in the CoreUnfolding case) so we don't 1478require saturation. 1479 1480Note [Nested functions] 1481~~~~~~~~~~~~~~~~~~~~~~~ 1482At one time we treated a call of a non-top-level function as 1483"interesting" (regardless of how boring the context) in the hope 1484that inlining it would eliminate the binding, and its allocation. 1485Specifically, in the default case of interesting_call we had 1486 _other -> not is_top && uf_arity > 0 1487 1488But actually postInlineUnconditionally does some of this and overall 1489it makes virtually no difference to nofib. So I simplified away this 1490special case 1491 1492Note [Cast then apply] 1493~~~~~~~~~~~~~~~~~~~~~~ 1494Consider 1495 myIndex = __inline_me ( (/\a. <blah>) |> co ) 1496 co :: (forall a. a -> a) ~ (forall a. T a) 1497 ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ... 1498 1499We need to inline myIndex to unravel this; but the actual call (myIndex a) has 1500no value arguments. The ValAppCtxt gives it enough incentive to inline. 1501 1502Note [Inlining in ArgCtxt] 1503~~~~~~~~~~~~~~~~~~~~~~~~~~ 1504The condition (arity > 0) here is very important, because otherwise 1505we end up inlining top-level stuff into useless places; eg 1506 x = I# 3# 1507 f = \y. g x 1508This can make a very big difference: it adds 16% to nofib 'integer' allocs, 1509and 20% to 'power'. 1510 1511At one stage I replaced this condition by 'True' (leading to the above 1512slow-down). The motivation was test eyeball/inline1.hs; but that seems 1513to work ok now. 1514 1515NOTE: arguably, we should inline in ArgCtxt only if the result of the 1516call is at least CONLIKE. At least for the cases where we use ArgCtxt 1517for the RHS of a 'let', we only profit from the inlining if we get a 1518CONLIKE thing (modulo lets). 1519 1520Note [Lone variables] See also Note [Interaction of exprIsWorkFree and lone variables] 1521~~~~~~~~~~~~~~~~~~~~~ which appears below 1522The "lone-variable" case is important. I spent ages messing about 1523with unsatisfactory variants, but this is nice. The idea is that if a 1524variable appears all alone 1525 1526 as an arg of lazy fn, or rhs BoringCtxt 1527 as scrutinee of a case CaseCtxt 1528 as arg of a fn ArgCtxt 1529AND 1530 it is bound to a cheap expression 1531 1532then we should not inline it (unless there is some other reason, 1533e.g. it is the sole occurrence). That is what is happening at 1534the use of 'lone_variable' in 'interesting_call'. 1535 1536Why? At least in the case-scrutinee situation, turning 1537 let x = (a,b) in case x of y -> ... 1538into 1539 let x = (a,b) in case (a,b) of y -> ... 1540and thence to 1541 let x = (a,b) in let y = (a,b) in ... 1542is bad if the binding for x will remain. 1543 1544Another example: I discovered that strings 1545were getting inlined straight back into applications of 'error' 1546because the latter is strict. 1547 s = "foo" 1548 f = \x -> ...(error s)... 1549 1550Fundamentally such contexts should not encourage inlining because, provided 1551the RHS is "expandable" (see Note [exprIsExpandable] in GHC.Core.Utils) the 1552context can ``see'' the unfolding of the variable (e.g. case or a 1553RULE) so there's no gain. 1554 1555However, watch out: 1556 1557 * Consider this: 1558 foo = _inline_ (\n. [n]) 1559 bar = _inline_ (foo 20) 1560 baz = \n. case bar of { (m:_) -> m + n } 1561 Here we really want to inline 'bar' so that we can inline 'foo' 1562 and the whole thing unravels as it should obviously do. This is 1563 important: in the NDP project, 'bar' generates a closure data 1564 structure rather than a list. 1565 1566 So the non-inlining of lone_variables should only apply if the 1567 unfolding is regarded as cheap; because that is when exprIsConApp_maybe 1568 looks through the unfolding. Hence the "&& is_wf" in the 1569 InlineRule branch. 1570 1571 * Even a type application or coercion isn't a lone variable. 1572 Consider 1573 case $fMonadST @ RealWorld of { :DMonad a b c -> c } 1574 We had better inline that sucker! The case won't see through it. 1575 1576 For now, I'm treating treating a variable applied to types 1577 in a *lazy* context "lone". The motivating example was 1578 f = /\a. \x. BIG 1579 g = /\a. \y. h (f a) 1580 There's no advantage in inlining f here, and perhaps 1581 a significant disadvantage. Hence some_val_args in the Stop case 1582 1583Note [Interaction of exprIsWorkFree and lone variables] 1584~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1585The lone-variable test says "don't inline if a case expression 1586scrutinises a lone variable whose unfolding is cheap". It's very 1587important that, under these circumstances, exprIsConApp_maybe 1588can spot a constructor application. So, for example, we don't 1589consider 1590 let x = e in (x,x) 1591to be cheap, and that's good because exprIsConApp_maybe doesn't 1592think that expression is a constructor application. 1593 1594In the 'not (lone_variable && is_wf)' test, I used to test is_value 1595rather than is_wf, which was utterly wrong, because the above 1596expression responds True to exprIsHNF, which is what sets is_value. 1597 1598This kind of thing can occur if you have 1599 1600 {-# INLINE foo #-} 1601 foo = let x = e in (x,x) 1602 1603which Roman did. 1604 1605 1606-} 1607 1608computeDiscount :: [Int] -> Int -> [ArgSummary] -> CallCtxt 1609 -> Int 1610computeDiscount arg_discounts res_discount arg_infos cont_info 1611 1612 = 10 -- Discount of 10 because the result replaces the call 1613 -- so we count 10 for the function itself 1614 1615 + 10 * length actual_arg_discounts 1616 -- Discount of 10 for each arg supplied, 1617 -- because the result replaces the call 1618 1619 + total_arg_discount + res_discount' 1620 where 1621 actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos 1622 total_arg_discount = sum actual_arg_discounts 1623 1624 mk_arg_discount _ TrivArg = 0 1625 mk_arg_discount _ NonTrivArg = 10 1626 mk_arg_discount discount ValueArg = discount 1627 1628 res_discount' 1629 | LT <- arg_discounts `compareLength` arg_infos 1630 = res_discount -- Over-saturated 1631 | otherwise 1632 = case cont_info of 1633 BoringCtxt -> 0 1634 CaseCtxt -> res_discount -- Presumably a constructor 1635 ValAppCtxt -> res_discount -- Presumably a function 1636 _ -> 40 `min` res_discount 1637 -- ToDo: this 40 `min` res_discount doesn't seem right 1638 -- for DiscArgCtxt it shouldn't matter because the function will 1639 -- get the arg discount for any non-triv arg 1640 -- for RuleArgCtxt we do want to be keener to inline; but not only 1641 -- constructor results 1642 -- for RhsCtxt I suppose that exposing a data con is good in general 1643 -- And 40 seems very arbitrary 1644 -- 1645 -- res_discount can be very large when a function returns 1646 -- constructors; but we only want to invoke that large discount 1647 -- when there's a case continuation. 1648 -- Otherwise we, rather arbitrarily, threshold it. Yuk. 1649 -- But we want to avoid inlining large functions that return 1650 -- constructors into contexts that are simply "interesting" 1651