1{- 2(c) The University of Glasgow 2006 3(c) The AQUA Project, Glasgow University, 1994-1998 4 5 6Core-syntax unfoldings 7 8Unfoldings (which can travel across module boundaries) are in Core 9syntax (namely @CoreExpr@s). 10 11The type @Unfolding@ sits ``above'' simply-Core-expressions 12unfoldings, capturing ``higher-level'' things we know about a binding, 13usually things that the simplifier found out (e.g., ``it's a 14literal''). In the corner of a @CoreUnfolding@ unfolding, you will 15find, unsurprisingly, a Core expression. 16-} 17 18{-# LANGUAGE CPP #-} 19 20module CoreUnfold ( 21 Unfolding, UnfoldingGuidance, -- Abstract types 22 23 noUnfolding, mkImplicitUnfolding, 24 mkUnfolding, mkCoreUnfolding, 25 mkTopUnfolding, mkSimpleUnfolding, mkWorkerUnfolding, 26 mkInlineUnfolding, mkInlineUnfoldingWithArity, 27 mkInlinableUnfolding, mkWwInlineRule, 28 mkCompulsoryUnfolding, mkDFunUnfolding, 29 specUnfolding, 30 31 ArgSummary(..), 32 33 couldBeSmallEnoughToInline, inlineBoringOk, 34 certainlyWillInline, smallEnoughToInline, 35 36 callSiteInline, CallCtxt(..), 37 38 -- Reexport from CoreSubst (it only live there so it can be used 39 -- by the Very Simple Optimiser) 40 exprIsConApp_maybe, exprIsLiteral_maybe 41 ) where 42 43#include "HsVersions.h" 44 45import GhcPrelude 46 47import DynFlags 48import CoreSyn 49import OccurAnal ( occurAnalyseExpr_NoBinderSwap ) 50import CoreOpt 51import CoreArity ( manifestArity ) 52import CoreUtils 53import Id 54import Demand ( isBottomingSig ) 55import DataCon 56import Literal 57import PrimOp 58import IdInfo 59import BasicTypes ( Arity, InlineSpec(..), inlinePragmaSpec ) 60import Type 61import PrelNames 62import TysPrim ( realWorldStatePrimTy ) 63import Bag 64import Util 65import Outputable 66import ForeignCall 67import Name 68 69import qualified Data.ByteString as BS 70import Data.List 71 72{- 73************************************************************************ 74* * 75\subsection{Making unfoldings} 76* * 77************************************************************************ 78-} 79 80mkTopUnfolding :: DynFlags -> Bool -> CoreExpr -> Unfolding 81mkTopUnfolding dflags is_bottoming rhs 82 = mkUnfolding dflags InlineRhs True is_bottoming rhs 83 84mkImplicitUnfolding :: DynFlags -> CoreExpr -> Unfolding 85-- For implicit Ids, do a tiny bit of optimising first 86mkImplicitUnfolding dflags expr 87 = mkTopUnfolding dflags False (simpleOptExpr dflags expr) 88 89-- Note [Top-level flag on inline rules] 90-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 91-- Slight hack: note that mk_inline_rules conservatively sets the 92-- top-level flag to True. It gets set more accurately by the simplifier 93-- Simplify.simplUnfolding. 94 95mkSimpleUnfolding :: DynFlags -> CoreExpr -> Unfolding 96mkSimpleUnfolding dflags rhs 97 = mkUnfolding dflags InlineRhs False False rhs 98 99mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding 100mkDFunUnfolding bndrs con ops 101 = DFunUnfolding { df_bndrs = bndrs 102 , df_con = con 103 , df_args = map occurAnalyseExpr_NoBinderSwap ops } 104 -- See Note [Occurrence analysis of unfoldings] 105 106mkWwInlineRule :: DynFlags -> CoreExpr -> Arity -> Unfolding 107mkWwInlineRule dflags expr arity 108 = mkCoreUnfolding InlineStable True 109 (simpleOptExpr dflags expr) 110 (UnfWhen { ug_arity = arity, ug_unsat_ok = unSaturatedOk 111 , ug_boring_ok = boringCxtNotOk }) 112 113mkCompulsoryUnfolding :: CoreExpr -> Unfolding 114mkCompulsoryUnfolding expr -- Used for things that absolutely must be unfolded 115 = mkCoreUnfolding InlineCompulsory True 116 (simpleOptExpr unsafeGlobalDynFlags expr) 117 (UnfWhen { ug_arity = 0 -- Arity of unfolding doesn't matter 118 , ug_unsat_ok = unSaturatedOk, ug_boring_ok = boringCxtOk }) 119 120mkWorkerUnfolding :: DynFlags -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding 121-- See Note [Worker-wrapper for INLINABLE functions] in WorkWrap 122mkWorkerUnfolding dflags work_fn 123 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl 124 , uf_is_top = top_lvl }) 125 | isStableSource src 126 = mkCoreUnfolding src top_lvl new_tmpl guidance 127 where 128 new_tmpl = simpleOptExpr dflags (work_fn tmpl) 129 guidance = calcUnfoldingGuidance dflags False new_tmpl 130 131mkWorkerUnfolding _ _ _ = noUnfolding 132 133-- | Make an unfolding that may be used unsaturated 134-- (ug_unsat_ok = unSaturatedOk) and that is reported as having its 135-- manifest arity (the number of outer lambdas applications will 136-- resolve before doing any work). 137mkInlineUnfolding :: CoreExpr -> Unfolding 138mkInlineUnfolding expr 139 = mkCoreUnfolding InlineStable 140 True -- Note [Top-level flag on inline rules] 141 expr' guide 142 where 143 expr' = simpleOptExpr unsafeGlobalDynFlags expr 144 guide = UnfWhen { ug_arity = manifestArity expr' 145 , ug_unsat_ok = unSaturatedOk 146 , ug_boring_ok = boring_ok } 147 boring_ok = inlineBoringOk expr' 148 149-- | Make an unfolding that will be used once the RHS has been saturated 150-- to the given arity. 151mkInlineUnfoldingWithArity :: Arity -> CoreExpr -> Unfolding 152mkInlineUnfoldingWithArity arity expr 153 = mkCoreUnfolding InlineStable 154 True -- Note [Top-level flag on inline rules] 155 expr' guide 156 where 157 expr' = simpleOptExpr unsafeGlobalDynFlags expr 158 guide = UnfWhen { ug_arity = arity 159 , ug_unsat_ok = needSaturated 160 , ug_boring_ok = boring_ok } 161 -- See Note [INLINE pragmas and boring contexts] as to why we need to look 162 -- at the arity here. 163 boring_ok | arity == 0 = True 164 | otherwise = inlineBoringOk expr' 165 166mkInlinableUnfolding :: DynFlags -> CoreExpr -> Unfolding 167mkInlinableUnfolding dflags expr 168 = mkUnfolding dflags InlineStable False False expr' 169 where 170 expr' = simpleOptExpr dflags expr 171 172specUnfolding :: DynFlags 173 -> [Var] -> (CoreExpr -> CoreExpr) 174 -> [CoreArg] -- LHS arguments in the RULE 175 -> Unfolding -> Unfolding 176-- See Note [Specialising unfoldings] 177-- specUnfolding spec_bndrs spec_args unf 178-- = \spec_bndrs. unf spec_args 179-- 180specUnfolding dflags spec_bndrs spec_app rule_lhs_args 181 df@(DFunUnfolding { df_bndrs = old_bndrs, df_con = con, df_args = args }) 182 = ASSERT2( rule_lhs_args `equalLength` old_bndrs 183 , ppr df $$ ppr rule_lhs_args ) 184 -- For this ASSERT see Note [DFunUnfoldings] in GHC.Core.Opt.Specialise 185 mkDFunUnfolding spec_bndrs con (map spec_arg args) 186 -- For DFunUnfoldings we transform 187 -- \obs. MkD <op1> ... <opn> 188 -- to 189 -- \sbs. MkD ((\obs. <op1>) spec_args) ... ditto <opn> 190 where 191 spec_arg arg = simpleOptExpr dflags $ 192 spec_app (mkLams old_bndrs arg) 193 -- The beta-redexes created by spec_app will be 194 -- simplified away by simplOptExpr 195 196specUnfolding dflags spec_bndrs spec_app rule_lhs_args 197 (CoreUnfolding { uf_src = src, uf_tmpl = tmpl 198 , uf_is_top = top_lvl 199 , uf_guidance = old_guidance }) 200 | isStableSource src -- See Note [Specialising unfoldings] 201 , UnfWhen { ug_arity = old_arity } <- old_guidance 202 = mkCoreUnfolding src top_lvl new_tmpl 203 (old_guidance { ug_arity = old_arity - arity_decrease }) 204 where 205 new_tmpl = simpleOptExpr dflags $ 206 mkLams spec_bndrs $ 207 spec_app tmpl -- The beta-redexes created by spec_app 208 -- will besimplified away by simplOptExpr 209 arity_decrease = count isValArg rule_lhs_args - count isId spec_bndrs 210 211 212specUnfolding _ _ _ _ _ = noUnfolding 213 214{- Note [Specialising unfoldings] 215~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 216When we specialise a function for some given type-class arguments, we use 217specUnfolding to specialise its unfolding. Some important points: 218 219* If the original function has a DFunUnfolding, the specialised one 220 must do so too! Otherwise we lose the magic rules that make it 221 interact with ClassOps 222 223* There is a bit of hack for INLINABLE functions: 224 f :: Ord a => .... 225 f = <big-rhs> 226 {- INLINABLE f #-} 227 Now if we specialise f, should the specialised version still have 228 an INLINABLE pragma? If it does, we'll capture a specialised copy 229 of <big-rhs> as its unfolding, and that probaby won't inline. But 230 if we don't, the specialised version of <big-rhs> might be small 231 enough to inline at a call site. This happens with Control.Monad.liftM3, 232 and can cause a lot more allocation as a result (nofib n-body shows this). 233 234 Moreover, keeping the INLINABLE thing isn't much help, because 235 the specialised function (probaby) isn't overloaded any more. 236 237 Conclusion: drop the INLINEALE pragma. In practice what this means is: 238 if a stable unfolding has UnfoldingGuidance of UnfWhen, 239 we keep it (so the specialised thing too will always inline) 240 if a stable unfolding has UnfoldingGuidance of UnfIfGoodArgs 241 (which arises from INLINABLE), we discard it 242 243Note [Honour INLINE on 0-ary bindings] 244~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 245Consider 246 247 x = <expensive> 248 {-# INLINE x #-} 249 250 f y = ...x... 251 252The semantics of an INLINE pragma is 253 254 inline x at every call site, provided it is saturated; 255 that is, applied to at least as many arguments as appear 256 on the LHS of the Haskell source definition. 257 258(This soure-code-derived arity is stored in the `ug_arity` field of 259the `UnfoldingGuidance`.) 260 261In the example, x's ug_arity is 0, so we should inline it at every use 262site. It's rare to have such an INLINE pragma (usually INLINE Is on 263functions), but it's occasionally very important (#15578, #15519). 264In #15519 we had something like 265 x = case (g a b) of I# r -> T r 266 {-# INLINE x #-} 267 f y = ...(h x).... 268 269where h is strict. So we got 270 f y = ...(case g a b of I# r -> h (T r))... 271 272and that in turn allowed SpecConstr to ramp up performance. 273 274How do we deliver on this? By adjusting the ug_boring_ok 275flag in mkInlineUnfoldingWithArity; see 276Note [INLINE pragmas and boring contexts] 277 278NB: there is a real risk that full laziness will float it right back 279out again. Consider again 280 x = factorial 200 281 {-# INLINE x #-} 282 f y = ...x... 283 284After inlining we get 285 f y = ...(factorial 200)... 286 287but it's entirely possible that full laziness will do 288 lvl23 = factorial 200 289 f y = ...lvl23... 290 291That's a problem for another day. 292 293Note [INLINE pragmas and boring contexts] 294~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 295An INLINE pragma uses mkInlineUnfoldingWithArity to build the 296unfolding. That sets the ug_boring_ok flag to False if the function 297is not tiny (inlineBoringOK), so that even INLINE functions are not 298inlined in an utterly boring context. E.g. 299 \x y. Just (f y x) 300Nothing is gained by inlining f here, even if it has an INLINE 301pragma. 302 303But for 0-ary bindings, we want to inline regardless; see 304Note [Honour INLINE on 0-ary bindings]. 305 306I'm a bit worried that it's possible for the same kind of problem 307to arise for non-0-ary functions too, but let's wait and see. 308-} 309 310mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr 311 -> UnfoldingGuidance -> Unfolding 312-- Occurrence-analyses the expression before capturing it 313mkCoreUnfolding src top_lvl expr guidance 314 = CoreUnfolding { uf_tmpl = occurAnalyseExpr_NoBinderSwap expr, 315 -- See Note [Occurrence analysis of unfoldings] 316 uf_src = src, 317 uf_is_top = top_lvl, 318 uf_is_value = exprIsHNF expr, 319 uf_is_conlike = exprIsConLike expr, 320 uf_is_work_free = exprIsWorkFree expr, 321 uf_expandable = exprIsExpandable expr, 322 uf_guidance = guidance } 323 324mkUnfolding :: DynFlags -> UnfoldingSource 325 -> Bool -- Is top-level 326 -> Bool -- Definitely a bottoming binding 327 -- (only relevant for top-level bindings) 328 -> CoreExpr 329 -> Unfolding 330-- Calculates unfolding guidance 331-- Occurrence-analyses the expression before capturing it 332mkUnfolding dflags src is_top_lvl is_bottoming expr 333 = CoreUnfolding { uf_tmpl = occurAnalyseExpr_NoBinderSwap expr, 334 -- See Note [Occurrence analysis of unfoldings] 335 uf_src = src, 336 uf_is_top = is_top_lvl, 337 uf_is_value = exprIsHNF expr, 338 uf_is_conlike = exprIsConLike expr, 339 uf_expandable = exprIsExpandable expr, 340 uf_is_work_free = exprIsWorkFree expr, 341 uf_guidance = guidance } 342 where 343 is_top_bottoming = is_top_lvl && is_bottoming 344 guidance = calcUnfoldingGuidance dflags is_top_bottoming expr 345 -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr_NoBinderSwap expr))! 346 -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression] 347 348{- 349Note [Occurrence analysis of unfoldings] 350~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 351We do occurrence-analysis of unfoldings once and for all, when the 352unfolding is built, rather than each time we inline them. 353 354But given this decision it's vital that we do 355*always* do it. Consider this unfolding 356 \x -> letrec { f = ...g...; g* = f } in body 357where g* is (for some strange reason) the loop breaker. If we don't 358occ-anal it when reading it in, we won't mark g as a loop breaker, and 359we may inline g entirely in body, dropping its binding, and leaving 360the occurrence in f out of scope. This happened in #8892, where 361the unfolding in question was a DFun unfolding. 362 363But more generally, the simplifier is designed on the 364basis that it is looking at occurrence-analysed expressions, so better 365ensure that they acutally are. 366 367We use occurAnalyseExpr_NoBinderSwap instead of occurAnalyseExpr; 368see Note [No binder swap in unfoldings]. 369 370Note [No binder swap in unfoldings] 371~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 372The binder swap can temporarily violate Core Lint, by assinging 373a LocalId binding to a GlobalId. For example, if A.foo{r872} 374is a GlobalId with unique r872, then 375 376 case A.foo{r872} of bar { 377 K x -> ...(A.foo{r872})... 378 } 379 380gets transformed to 381 382 case A.foo{r872} of bar { 383 K x -> let foo{r872} = bar 384 in ...(A.foo{r872})... 385 386This is usually not a problem, because the simplifier will transform 387this to: 388 389 case A.foo{r872} of bar { 390 K x -> ...(bar)... 391 392However, after occurrence analysis but before simplification, this extra 'let' 393violates the Core Lint invariant that we do not have local 'let' bindings for 394GlobalIds. That seems (just) tolerable for the occurrence analysis that happens 395just before the Simplifier, but not for unfoldings, which are Linted 396independently. 397As a quick workaround, we disable binder swap in this module. 398See #16288 and #16296 for further plans. 399 400Note [Calculate unfolding guidance on the non-occ-anal'd expression] 401~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 402Notice that we give the non-occur-analysed expression to 403calcUnfoldingGuidance. In some ways it'd be better to occur-analyse 404first; for example, sometimes during simplification, there's a large 405let-bound thing which has been substituted, and so is now dead; so 406'expr' contains two copies of the thing while the occurrence-analysed 407expression doesn't. 408 409Nevertheless, we *don't* and *must not* occ-analyse before computing 410the size because 411 412a) The size computation bales out after a while, whereas occurrence 413 analysis does not. 414 415b) Residency increases sharply if you occ-anal first. I'm not 416 100% sure why, but it's a large effect. Compiling Cabal went 417 from residency of 534M to over 800M with this one change. 418 419This can occasionally mean that the guidance is very pessimistic; 420it gets fixed up next round. And it should be rare, because large 421let-bound things that are dead are usually caught by preInlineUnconditionally 422 423 424************************************************************************ 425* * 426\subsection{The UnfoldingGuidance type} 427* * 428************************************************************************ 429-} 430 431inlineBoringOk :: CoreExpr -> Bool 432-- See Note [INLINE for small functions] 433-- True => the result of inlining the expression is 434-- no bigger than the expression itself 435-- eg (\x y -> f y x) 436-- This is a quick and dirty version. It doesn't attempt 437-- to deal with (\x y z -> x (y z)) 438-- The really important one is (x `cast` c) 439inlineBoringOk e 440 = go 0 e 441 where 442 go :: Int -> CoreExpr -> Bool 443 go credit (Lam x e) | isId x = go (credit+1) e 444 | otherwise = go credit e 445 -- See Note [Count coercion arguments in boring contexts] 446 go credit (App f (Type {})) = go credit f 447 go credit (App f a) | credit > 0 448 , exprIsTrivial a = go (credit-1) f 449 go credit (Tick _ e) = go credit e -- dubious 450 go credit (Cast e _) = go credit e 451 go _ (Var {}) = boringCxtOk 452 go _ _ = boringCxtNotOk 453 454calcUnfoldingGuidance 455 :: DynFlags 456 -> Bool -- Definitely a top-level, bottoming binding 457 -> CoreExpr -- Expression to look at 458 -> UnfoldingGuidance 459calcUnfoldingGuidance dflags is_top_bottoming (Tick t expr) 460 | not (tickishIsCode t) -- non-code ticks don't matter for unfolding 461 = calcUnfoldingGuidance dflags is_top_bottoming expr 462calcUnfoldingGuidance dflags is_top_bottoming expr 463 = case sizeExpr dflags bOMB_OUT_SIZE val_bndrs body of 464 TooBig -> UnfNever 465 SizeIs size cased_bndrs scrut_discount 466 | uncondInline expr n_val_bndrs size 467 -> UnfWhen { ug_unsat_ok = unSaturatedOk 468 , ug_boring_ok = boringCxtOk 469 , ug_arity = n_val_bndrs } -- Note [INLINE for small functions] 470 471 | is_top_bottoming 472 -> UnfNever -- See Note [Do not inline top-level bottoming functions] 473 474 | otherwise 475 -> UnfIfGoodArgs { ug_args = map (mk_discount cased_bndrs) val_bndrs 476 , ug_size = size 477 , ug_res = scrut_discount } 478 479 where 480 (bndrs, body) = collectBinders expr 481 bOMB_OUT_SIZE = ufCreationThreshold dflags 482 -- Bomb out if size gets bigger than this 483 val_bndrs = filter isId bndrs 484 n_val_bndrs = length val_bndrs 485 486 mk_discount :: Bag (Id,Int) -> Id -> Int 487 mk_discount cbs bndr = foldl' combine 0 cbs 488 where 489 combine acc (bndr', disc) 490 | bndr == bndr' = acc `plus_disc` disc 491 | otherwise = acc 492 493 plus_disc :: Int -> Int -> Int 494 plus_disc | isFunTy (idType bndr) = max 495 | otherwise = (+) 496 -- See Note [Function and non-function discounts] 497 498{- 499Note [Computing the size of an expression] 500~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 501The basic idea of sizeExpr is obvious enough: count nodes. But getting the 502heuristics right has taken a long time. Here's the basic strategy: 503 504 * Variables, literals: 0 505 (Exception for string literals, see litSize.) 506 507 * Function applications (f e1 .. en): 1 + #value args 508 509 * Constructor applications: 1, regardless of #args 510 511 * Let(rec): 1 + size of components 512 513 * Note, cast: 0 514 515Examples 516 517 Size Term 518 -------------- 519 0 42# 520 0 x 521 0 True 522 2 f x 523 1 Just x 524 4 f (g x) 525 526Notice that 'x' counts 0, while (f x) counts 2. That's deliberate: there's 527a function call to account for. Notice also that constructor applications 528are very cheap, because exposing them to a caller is so valuable. 529 530[25/5/11] All sizes are now multiplied by 10, except for primops 531(which have sizes like 1 or 4. This makes primops look fantastically 532cheap, and seems to be almost unversally beneficial. Done partly as a 533result of #4978. 534 535Note [Do not inline top-level bottoming functions] 536~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 537The FloatOut pass has gone to some trouble to float out calls to 'error' 538and similar friends. See Note [Bottoming floats] in SetLevels. 539Do not re-inline them! But we *do* still inline if they are very small 540(the uncondInline stuff). 541 542Note [INLINE for small functions] 543~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 544Consider {-# INLINE f #-} 545 f x = Just x 546 g y = f y 547Then f's RHS is no larger than its LHS, so we should inline it into 548even the most boring context. In general, f the function is 549sufficiently small that its body is as small as the call itself, the 550inline unconditionally, regardless of how boring the context is. 551 552Things to note: 553 554(1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr) 555 than the thing it's replacing. Notice that 556 (f x) --> (g 3) -- YES, unconditionally 557 (f x) --> x : [] -- YES, *even though* there are two 558 -- arguments to the cons 559 x --> g 3 -- NO 560 x --> Just v -- NO 561 562 It's very important not to unconditionally replace a variable by 563 a non-atomic term. 564 565(2) We do this even if the thing isn't saturated, else we end up with the 566 silly situation that 567 f x y = x 568 ...map (f 3)... 569 doesn't inline. Even in a boring context, inlining without being 570 saturated will give a lambda instead of a PAP, and will be more 571 efficient at runtime. 572 573(3) However, when the function's arity > 0, we do insist that it 574 has at least one value argument at the call site. (This check is 575 made in the UnfWhen case of callSiteInline.) Otherwise we find this: 576 f = /\a \x:a. x 577 d = /\b. MkD (f b) 578 If we inline f here we get 579 d = /\b. MkD (\x:b. x) 580 and then prepareRhs floats out the argument, abstracting the type 581 variables, so we end up with the original again! 582 583(4) We must be much more cautious about arity-zero things. Consider 584 let x = y +# z in ... 585 In *size* terms primops look very small, because the generate a 586 single instruction, but we do not want to unconditionally replace 587 every occurrence of x with (y +# z). So we only do the 588 unconditional-inline thing for *trivial* expressions. 589 590 NB: you might think that PostInlineUnconditionally would do this 591 but it doesn't fire for top-level things; see SimplUtils 592 Note [Top level and postInlineUnconditionally] 593 594Note [Count coercion arguments in boring contexts] 595~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 596In inlineBoringOK, we ignore type arguments when deciding whether an 597expression is okay to inline into boring contexts. This is good, since 598if we have a definition like 599 600 let y = x @Int in f y y 601 602there’s no reason not to inline y at both use sites — no work is 603actually duplicated. It may seem like the same reasoning applies to 604coercion arguments, and indeed, in #17182 we changed inlineBoringOK to 605treat coercions the same way. 606 607However, this isn’t a good idea: unlike type arguments, which have 608no runtime representation, coercion arguments *do* have a runtime 609representation (albeit the zero-width VoidRep, see Note [Coercion tokens] 610in CoreToStg.hs). This caused trouble in #17787 for DataCon wrappers for 611nullary GADT constructors: the wrappers would be inlined and each use of 612the constructor would lead to a separate allocation instead of just 613sharing the wrapper closure. 614 615The solution: don’t ignore coercion arguments after all. 616-} 617 618uncondInline :: CoreExpr -> Arity -> Int -> Bool 619-- Inline unconditionally if there no size increase 620-- Size of call is arity (+1 for the function) 621-- See Note [INLINE for small functions] 622uncondInline rhs arity size 623 | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1) 624 | otherwise = exprIsTrivial rhs -- See Note [INLINE for small functions] (4) 625 626sizeExpr :: DynFlags 627 -> Int -- Bomb out if it gets bigger than this 628 -> [Id] -- Arguments; we're interested in which of these 629 -- get case'd 630 -> CoreExpr 631 -> ExprSize 632 633-- Note [Computing the size of an expression] 634 635sizeExpr dflags bOMB_OUT_SIZE top_args expr 636 = size_up expr 637 where 638 size_up (Cast e _) = size_up e 639 size_up (Tick _ e) = size_up e 640 size_up (Type _) = sizeZero -- Types cost nothing 641 size_up (Coercion _) = sizeZero 642 size_up (Lit lit) = sizeN (litSize lit) 643 size_up (Var f) | isRealWorldId f = sizeZero 644 -- Make sure we get constructor discounts even 645 -- on nullary constructors 646 | otherwise = size_up_call f [] 0 647 648 size_up (App fun arg) 649 | isTyCoArg arg = size_up fun 650 | otherwise = size_up arg `addSizeNSD` 651 size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0) 652 653 size_up (Lam b e) 654 | isId b && not (isRealWorldId b) = lamScrutDiscount dflags (size_up e `addSizeN` 10) 655 | otherwise = size_up e 656 657 size_up (Let (NonRec binder rhs) body) 658 = size_up_rhs (binder, rhs) `addSizeNSD` 659 size_up body `addSizeN` 660 size_up_alloc binder 661 662 size_up (Let (Rec pairs) body) 663 = foldr (addSizeNSD . size_up_rhs) 664 (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs)) 665 pairs 666 667 size_up (Case e _ _ alts) 668 | null alts 669 = size_up e -- case e of {} never returns, so take size of scrutinee 670 671 size_up (Case e _ _ alts) 672 -- Now alts is non-empty 673 | Just v <- is_top_arg e -- We are scrutinising an argument variable 674 = let 675 alt_sizes = map size_up_alt alts 676 677 -- alts_size tries to compute a good discount for 678 -- the case when we are scrutinising an argument variable 679 alts_size (SizeIs tot tot_disc tot_scrut) 680 -- Size of all alternatives 681 (SizeIs max _ _) 682 -- Size of biggest alternative 683 = SizeIs tot (unitBag (v, 20 + tot - max) 684 `unionBags` tot_disc) tot_scrut 685 -- If the variable is known, we produce a 686 -- discount that will take us back to 'max', 687 -- the size of the largest alternative The 688 -- 1+ is a little discount for reduced 689 -- allocation in the caller 690 -- 691 -- Notice though, that we return tot_disc, 692 -- the total discount from all branches. I 693 -- think that's right. 694 695 alts_size tot_size _ = tot_size 696 in 697 alts_size (foldr1 addAltSize alt_sizes) -- alts is non-empty 698 (foldr1 maxSize alt_sizes) 699 -- Good to inline if an arg is scrutinised, because 700 -- that may eliminate allocation in the caller 701 -- And it eliminates the case itself 702 where 703 is_top_arg (Var v) | v `elem` top_args = Just v 704 is_top_arg (Cast e _) = is_top_arg e 705 is_top_arg _ = Nothing 706 707 708 size_up (Case e _ _ alts) = size_up e `addSizeNSD` 709 foldr (addAltSize . size_up_alt) case_size alts 710 where 711 case_size 712 | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10) 713 | otherwise = sizeZero 714 -- Normally we don't charge for the case itself, but 715 -- we charge one per alternative (see size_up_alt, 716 -- below) to account for the cost of the info table 717 -- and comparisons. 718 -- 719 -- However, in certain cases (see is_inline_scrut 720 -- below), no code is generated for the case unless 721 -- there are multiple alts. In these cases we 722 -- subtract one, making the first alt free. 723 -- e.g. case x# +# y# of _ -> ... should cost 1 724 -- case touch# x# of _ -> ... should cost 0 725 -- (see #4978) 726 -- 727 -- I would like to not have the "lengthAtMost alts 1" 728 -- condition above, but without that some programs got worse 729 -- (spectral/hartel/event and spectral/para). I don't fully 730 -- understand why. (SDM 24/5/11) 731 732 -- unboxed variables, inline primops and unsafe foreign calls 733 -- are all "inline" things: 734 is_inline_scrut (Var v) = isUnliftedType (idType v) 735 is_inline_scrut scrut 736 | (Var f, _) <- collectArgs scrut 737 = case idDetails f of 738 FCallId fc -> not (isSafeForeignCall fc) 739 PrimOpId op -> not (primOpOutOfLine op) 740 _other -> False 741 | otherwise 742 = False 743 744 size_up_rhs (bndr, rhs) 745 | Just join_arity <- isJoinId_maybe bndr 746 -- Skip arguments to join point 747 , (_bndrs, body) <- collectNBinders join_arity rhs 748 = size_up body 749 | otherwise 750 = size_up rhs 751 752 ------------ 753 -- size_up_app is used when there's ONE OR MORE value args 754 size_up_app (App fun arg) args voids 755 | isTyCoArg arg = size_up_app fun args voids 756 | isRealWorldExpr arg = size_up_app fun (arg:args) (voids + 1) 757 | otherwise = size_up arg `addSizeNSD` 758 size_up_app fun (arg:args) voids 759 size_up_app (Var fun) args voids = size_up_call fun args voids 760 size_up_app (Tick _ expr) args voids = size_up_app expr args voids 761 size_up_app (Cast expr _) args voids = size_up_app expr args voids 762 size_up_app other args voids = size_up other `addSizeN` 763 callSize (length args) voids 764 -- if the lhs is not an App or a Var, or an invisible thing like a 765 -- Tick or Cast, then we should charge for a complete call plus the 766 -- size of the lhs itself. 767 768 ------------ 769 size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize 770 size_up_call fun val_args voids 771 = case idDetails fun of 772 FCallId _ -> sizeN (callSize (length val_args) voids) 773 DataConWorkId dc -> conSize dc (length val_args) 774 PrimOpId op -> primOpSize op (length val_args) 775 ClassOpId _ -> classOpSize dflags top_args val_args 776 _ -> funSize dflags top_args fun (length val_args) voids 777 778 ------------ 779 size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10 780 -- Don't charge for args, so that wrappers look cheap 781 -- (See comments about wrappers with Case) 782 -- 783 -- IMPORTANT: *do* charge 1 for the alternative, else we 784 -- find that giant case nests are treated as practically free 785 -- A good example is Foreign.C.Error.errnoToIOError 786 787 ------------ 788 -- Cost to allocate binding with given binder 789 size_up_alloc bndr 790 | isTyVar bndr -- Doesn't exist at runtime 791 || isJoinId bndr -- Not allocated at all 792 || isUnliftedType (idType bndr) -- Doesn't live in heap 793 = 0 794 | otherwise 795 = 10 796 797 ------------ 798 -- These addSize things have to be here because 799 -- I don't want to give them bOMB_OUT_SIZE as an argument 800 addSizeN TooBig _ = TooBig 801 addSizeN (SizeIs n xs d) m = mkSizeIs bOMB_OUT_SIZE (n + m) xs d 802 803 -- addAltSize is used to add the sizes of case alternatives 804 addAltSize TooBig _ = TooBig 805 addAltSize _ TooBig = TooBig 806 addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2) 807 = mkSizeIs bOMB_OUT_SIZE (n1 + n2) 808 (xs `unionBags` ys) 809 (d1 + d2) -- Note [addAltSize result discounts] 810 811 -- This variant ignores the result discount from its LEFT argument 812 -- It's used when the second argument isn't part of the result 813 addSizeNSD TooBig _ = TooBig 814 addSizeNSD _ TooBig = TooBig 815 addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2) 816 = mkSizeIs bOMB_OUT_SIZE (n1 + n2) 817 (xs `unionBags` ys) 818 d2 -- Ignore d1 819 820 isRealWorldId id = idType id `eqType` realWorldStatePrimTy 821 822 -- an expression of type State# RealWorld must be a variable 823 isRealWorldExpr (Var id) = isRealWorldId id 824 isRealWorldExpr (Tick _ e) = isRealWorldExpr e 825 isRealWorldExpr _ = False 826 827-- | Finds a nominal size of a string literal. 828litSize :: Literal -> Int 829-- Used by CoreUnfold.sizeExpr 830litSize (LitNumber LitNumInteger _ _) = 100 -- Note [Size of literal integers] 831litSize (LitNumber LitNumNatural _ _) = 100 832litSize (LitString str) = 10 + 10 * ((BS.length str + 3) `div` 4) 833 -- If size could be 0 then @f "x"@ might be too small 834 -- [Sept03: make literal strings a bit bigger to avoid fruitless 835 -- duplication of little strings] 836litSize _other = 0 -- Must match size of nullary constructors 837 -- Key point: if x |-> 4, then x must inline unconditionally 838 -- (eg via case binding) 839 840classOpSize :: DynFlags -> [Id] -> [CoreExpr] -> ExprSize 841-- See Note [Conlike is interesting] 842classOpSize _ _ [] 843 = sizeZero 844classOpSize dflags top_args (arg1 : other_args) 845 = SizeIs size arg_discount 0 846 where 847 size = 20 + (10 * length other_args) 848 -- If the class op is scrutinising a lambda bound dictionary then 849 -- give it a discount, to encourage the inlining of this function 850 -- The actual discount is rather arbitrarily chosen 851 arg_discount = case arg1 of 852 Var dict | dict `elem` top_args 853 -> unitBag (dict, ufDictDiscount dflags) 854 _other -> emptyBag 855 856-- | The size of a function call 857callSize 858 :: Int -- ^ number of value args 859 -> Int -- ^ number of value args that are void 860 -> Int 861callSize n_val_args voids = 10 * (1 + n_val_args - voids) 862 -- The 1+ is for the function itself 863 -- Add 1 for each non-trivial arg; 864 -- the allocation cost, as in let(rec) 865 866-- | The size of a jump to a join point 867jumpSize 868 :: Int -- ^ number of value args 869 -> Int -- ^ number of value args that are void 870 -> Int 871jumpSize n_val_args voids = 2 * (1 + n_val_args - voids) 872 -- A jump is 20% the size of a function call. Making jumps free reopens 873 -- bug #6048, but making them any more expensive loses a 21% improvement in 874 -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a 875 -- better solution? 876 877funSize :: DynFlags -> [Id] -> Id -> Int -> Int -> ExprSize 878-- Size for functions that are not constructors or primops 879-- Note [Function applications] 880funSize dflags top_args fun n_val_args voids 881 | fun `hasKey` buildIdKey = buildSize 882 | fun `hasKey` augmentIdKey = augmentSize 883 | otherwise = SizeIs size arg_discount res_discount 884 where 885 some_val_args = n_val_args > 0 886 is_join = isJoinId fun 887 888 size | is_join = jumpSize n_val_args voids 889 | not some_val_args = 0 890 | otherwise = callSize n_val_args voids 891 892 -- DISCOUNTS 893 -- See Note [Function and non-function discounts] 894 arg_discount | some_val_args && fun `elem` top_args 895 = unitBag (fun, ufFunAppDiscount dflags) 896 | otherwise = emptyBag 897 -- If the function is an argument and is applied 898 -- to some values, give it an arg-discount 899 900 res_discount | idArity fun > n_val_args = ufFunAppDiscount dflags 901 | otherwise = 0 902 -- If the function is partially applied, show a result discount 903-- XXX maybe behave like ConSize for eval'd variable 904 905conSize :: DataCon -> Int -> ExprSize 906conSize dc n_val_args 907 | n_val_args == 0 = SizeIs 0 emptyBag 10 -- Like variables 908 909-- See Note [Unboxed tuple size and result discount] 910 | isUnboxedTupleCon dc = SizeIs 0 emptyBag (10 * (1 + n_val_args)) 911 912-- See Note [Constructor size and result discount] 913 | otherwise = SizeIs 10 emptyBag (10 * (1 + n_val_args)) 914 915-- XXX still looks to large to me 916 917{- 918Note [Constructor size and result discount] 919~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 920Treat a constructors application as size 10, regardless of how many 921arguments it has; we are keen to expose them (and we charge separately 922for their args). We can't treat them as size zero, else we find that 923(Just x) has size 0, which is the same as a lone variable; and hence 924'v' will always be replaced by (Just x), where v is bound to Just x. 925 926The "result discount" is applied if the result of the call is 927scrutinised (say by a case). For a constructor application that will 928mean the constructor application will disappear, so we don't need to 929charge it to the function. So the discount should at least match the 930cost of the constructor application, namely 10. But to give a bit 931of extra incentive we give a discount of 10*(1 + n_val_args). 932 933Simon M tried a MUCH bigger discount: (10 * (10 + n_val_args)), 934and said it was an "unambiguous win", but its terribly dangerous 935because a function with many many case branches, each finishing with 936a constructor, can have an arbitrarily large discount. This led to 937terrible code bloat: see #6099. 938 939Note [Unboxed tuple size and result discount] 940~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 941However, unboxed tuples count as size zero. I found occasions where we had 942 f x y z = case op# x y z of { s -> (# s, () #) } 943and f wasn't getting inlined. 944 945I tried giving unboxed tuples a *result discount* of zero (see the 946commented-out line). Why? When returned as a result they do not 947allocate, so maybe we don't want to charge so much for them If you 948have a non-zero discount here, we find that workers often get inlined 949back into wrappers, because it look like 950 f x = case $wf x of (# a,b #) -> (a,b) 951and we are keener because of the case. However while this change 952shrank binary sizes by 0.5% it also made spectral/boyer allocate 5% 953more. All other changes were very small. So it's not a big deal but I 954didn't adopt the idea. 955 956Note [Function and non-function discounts] 957~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 958We want a discount if the function is applied. A good example is 959monadic combinators with continuation arguments, where inlining is 960quite important. 961 962But we don't want a big discount when a function is called many times 963(see the detailed comments with #6048) because if the function is 964big it won't be inlined at its many call sites and no benefit results. 965Indeed, we can get exponentially big inlinings this way; that is what 966#6048 is about. 967 968On the other hand, for data-valued arguments, if there are lots of 969case expressions in the body, each one will get smaller if we apply 970the function to a constructor application, so we *want* a big discount 971if the argument is scrutinised by many case expressions. 972 973Conclusion: 974 - For functions, take the max of the discounts 975 - For data values, take the sum of the discounts 976 977 978Note [Literal integer size] 979~~~~~~~~~~~~~~~~~~~~~~~~~~~ 980Literal integers *can* be big (mkInteger [...coefficients...]), but 981need not be (S# n). We just use an arbitrary big-ish constant here 982so that, in particular, we don't inline top-level defns like 983 n = S# 5 984There's no point in doing so -- any optimisations will see the S# 985through n's unfolding. Nor will a big size inhibit unfoldings functions 986that mention a literal Integer, because the float-out pass will float 987all those constants to top level. 988-} 989 990primOpSize :: PrimOp -> Int -> ExprSize 991primOpSize op n_val_args 992 = if primOpOutOfLine op 993 then sizeN (op_size + n_val_args) 994 else sizeN op_size 995 where 996 op_size = primOpCodeSize op 997 998 999buildSize :: ExprSize 1000buildSize = SizeIs 0 emptyBag 40 1001 -- We really want to inline applications of build 1002 -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later) 1003 -- Indeed, we should add a result_discount because build is 1004 -- very like a constructor. We don't bother to check that the 1005 -- build is saturated (it usually is). The "-2" discounts for the \c n, 1006 -- The "4" is rather arbitrary. 1007 1008augmentSize :: ExprSize 1009augmentSize = SizeIs 0 emptyBag 40 1010 -- Ditto (augment t (\cn -> e) ys) should cost only the cost of 1011 -- e plus ys. The -2 accounts for the \cn 1012 1013-- When we return a lambda, give a discount if it's used (applied) 1014lamScrutDiscount :: DynFlags -> ExprSize -> ExprSize 1015lamScrutDiscount dflags (SizeIs n vs _) = SizeIs n vs (ufFunAppDiscount dflags) 1016lamScrutDiscount _ TooBig = TooBig 1017 1018{- 1019Note [addAltSize result discounts] 1020~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1021When adding the size of alternatives, we *add* the result discounts 1022too, rather than take the *maximum*. For a multi-branch case, this 1023gives a discount for each branch that returns a constructor, making us 1024keener to inline. I did try using 'max' instead, but it makes nofib 1025'rewrite' and 'puzzle' allocate significantly more, and didn't make 1026binary sizes shrink significantly either. 1027 1028Note [Discounts and thresholds] 1029~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1030Constants for discounts and thesholds are defined in main/DynFlags, 1031all of form ufXxxx. They are: 1032 1033ufCreationThreshold 1034 At a definition site, if the unfolding is bigger than this, we 1035 may discard it altogether 1036 1037ufUseThreshold 1038 At a call site, if the unfolding, less discounts, is smaller than 1039 this, then it's small enough inline 1040 1041ufKeenessFactor 1042 Factor by which the discounts are multiplied before 1043 subtracting from size 1044 1045ufDictDiscount 1046 The discount for each occurrence of a dictionary argument 1047 as an argument of a class method. Should be pretty small 1048 else big functions may get inlined 1049 1050ufFunAppDiscount 1051 Discount for a function argument that is applied. Quite 1052 large, because if we inline we avoid the higher-order call. 1053 1054ufDearOp 1055 The size of a foreign call or not-dupable PrimOp 1056 1057ufVeryAggressive 1058 If True, the compiler ignores all the thresholds and inlines very 1059 aggressively. It still adheres to arity, simplifier phase control and 1060 loop breakers. 1061 1062 1063Note [Function applications] 1064~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1065In a function application (f a b) 1066 1067 - If 'f' is an argument to the function being analysed, 1068 and there's at least one value arg, record a FunAppDiscount for f 1069 1070 - If the application if a PAP (arity > 2 in this example) 1071 record a *result* discount (because inlining 1072 with "extra" args in the call may mean that we now 1073 get a saturated application) 1074 1075Code for manipulating sizes 1076-} 1077 1078-- | The size of a candidate expression for unfolding 1079data ExprSize 1080 = TooBig 1081 | SizeIs { _es_size_is :: {-# UNPACK #-} !Int -- ^ Size found 1082 , _es_args :: !(Bag (Id,Int)) 1083 -- ^ Arguments cased herein, and discount for each such 1084 , _es_discount :: {-# UNPACK #-} !Int 1085 -- ^ Size to subtract if result is scrutinised by a case 1086 -- expression 1087 } 1088 1089instance Outputable ExprSize where 1090 ppr TooBig = text "TooBig" 1091 ppr (SizeIs a _ c) = brackets (int a <+> int c) 1092 1093-- subtract the discount before deciding whether to bale out. eg. we 1094-- want to inline a large constructor application into a selector: 1095-- tup = (a_1, ..., a_99) 1096-- x = case tup of ... 1097-- 1098mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize 1099mkSizeIs max n xs d | (n - d) > max = TooBig 1100 | otherwise = SizeIs n xs d 1101 1102maxSize :: ExprSize -> ExprSize -> ExprSize 1103maxSize TooBig _ = TooBig 1104maxSize _ TooBig = TooBig 1105maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2 = s1 1106 | otherwise = s2 1107 1108sizeZero :: ExprSize 1109sizeN :: Int -> ExprSize 1110 1111sizeZero = SizeIs 0 emptyBag 0 1112sizeN n = SizeIs n emptyBag 0 1113 1114{- 1115************************************************************************ 1116* * 1117\subsection[considerUnfolding]{Given all the info, do (not) do the unfolding} 1118* * 1119************************************************************************ 1120 1121We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that 1122we ``couldn't possibly use'' on the other side. Can be overridden w/ 1123flaggery. Just the same as smallEnoughToInline, except that it has no 1124actual arguments. 1125-} 1126 1127couldBeSmallEnoughToInline :: DynFlags -> Int -> CoreExpr -> Bool 1128couldBeSmallEnoughToInline dflags threshold rhs 1129 = case sizeExpr dflags threshold [] body of 1130 TooBig -> False 1131 _ -> True 1132 where 1133 (_, body) = collectBinders rhs 1134 1135---------------- 1136smallEnoughToInline :: DynFlags -> Unfolding -> Bool 1137smallEnoughToInline dflags (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}}) 1138 = size <= ufUseThreshold dflags 1139smallEnoughToInline _ _ 1140 = False 1141 1142---------------- 1143 1144certainlyWillInline :: DynFlags -> IdInfo -> Maybe Unfolding 1145-- ^ Sees if the unfolding is pretty certain to inline. 1146-- If so, return a *stable* unfolding for it, that will always inline. 1147certainlyWillInline dflags fn_info 1148 = case unfoldingInfo fn_info of 1149 CoreUnfolding { uf_tmpl = e, uf_guidance = g } 1150 | loop_breaker -> Nothing -- Won't inline, so try w/w 1151 | noinline -> Nothing -- See Note [Worker-wrapper for NOINLINE functions] 1152 | otherwise -> do_cunf e g -- Depends on size, so look at that 1153 1154 DFunUnfolding {} -> Just fn_unf -- Don't w/w DFuns; it never makes sense 1155 -- to do so, and even if it is currently a 1156 -- loop breaker, it may not be later 1157 1158 _other_unf -> Nothing 1159 1160 where 1161 loop_breaker = isStrongLoopBreaker (occInfo fn_info) 1162 noinline = inlinePragmaSpec (inlinePragInfo fn_info) == NoInline 1163 fn_unf = unfoldingInfo fn_info 1164 1165 do_cunf :: CoreExpr -> UnfoldingGuidance -> Maybe Unfolding 1166 do_cunf _ UnfNever = Nothing 1167 do_cunf _ (UnfWhen {}) = Just (fn_unf { uf_src = InlineStable }) 1168 -- INLINE functions have UnfWhen 1169 1170 -- The UnfIfGoodArgs case seems important. If we w/w small functions 1171 -- binary sizes go up by 10%! (This is with SplitObjs.) 1172 -- I'm not totally sure why. 1173 -- INLINABLE functions come via this path 1174 -- See Note [certainlyWillInline: INLINABLE] 1175 do_cunf expr (UnfIfGoodArgs { ug_size = size, ug_args = args }) 1176 | arityInfo fn_info > 0 -- See Note [certainlyWillInline: be careful of thunks] 1177 , not (isBottomingSig (strictnessInfo fn_info)) 1178 -- Do not unconditionally inline a bottoming functions even if 1179 -- it seems smallish. We've carefully lifted it out to top level, 1180 -- so we don't want to re-inline it. 1181 , let unf_arity = length args 1182 , size - (10 * (unf_arity + 1)) <= ufUseThreshold dflags 1183 = Just (fn_unf { uf_src = InlineStable 1184 , uf_guidance = UnfWhen { ug_arity = unf_arity 1185 , ug_unsat_ok = unSaturatedOk 1186 , ug_boring_ok = inlineBoringOk expr } }) 1187 -- Note the "unsaturatedOk". A function like f = \ab. a 1188 -- will certainly inline, even if partially applied (f e), so we'd 1189 -- better make sure that the transformed inlining has the same property 1190 | otherwise 1191 = Nothing 1192 1193{- Note [certainlyWillInline: be careful of thunks] 1194~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1195Don't claim that thunks will certainly inline, because that risks work 1196duplication. Even if the work duplication is not great (eg is_cheap 1197holds), it can make a big difference in an inner loop In #5623 we 1198found that the WorkWrap phase thought that 1199 y = case x of F# v -> F# (v +# v) 1200was certainlyWillInline, so the addition got duplicated. 1201 1202Note that we check arityInfo instead of the arity of the unfolding to detect 1203this case. This is so that we don't accidentally fail to inline small partial 1204applications, like `f = g 42` (where `g` recurses into `f`) where g has arity 2 1205(say). Here there is no risk of work duplication, and the RHS is tiny, so 1206certainlyWillInline should return True. But `unf_arity` is zero! However f's 1207arity, gotten from `arityInfo fn_info`, is 1. 1208 1209Failing to say that `f` will inline forces W/W to generate a potentially huge 1210worker for f that will immediately cancel with `g`'s wrapper anyway, causing 1211unnecessary churn in the Simplifier while arriving at the same result. 1212 1213Note [certainlyWillInline: INLINABLE] 1214~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1215certainlyWillInline /must/ return Nothing for a large INLINABLE thing, 1216even though we have a stable inlining, so that strictness w/w takes 1217place. It makes a big difference to efficiency, and the w/w pass knows 1218how to transfer the INLINABLE info to the worker; see WorkWrap 1219Note [Worker-wrapper for INLINABLE functions] 1220 1221************************************************************************ 1222* * 1223\subsection{callSiteInline} 1224* * 1225************************************************************************ 1226 1227This is the key function. It decides whether to inline a variable at a call site 1228 1229callSiteInline is used at call sites, so it is a bit more generous. 1230It's a very important function that embodies lots of heuristics. 1231A non-WHNF can be inlined if it doesn't occur inside a lambda, 1232and occurs exactly once or 1233 occurs once in each branch of a case and is small 1234 1235If the thing is in WHNF, there's no danger of duplicating work, 1236so we can inline if it occurs once, or is small 1237 1238NOTE: we don't want to inline top-level functions that always diverge. 1239It just makes the code bigger. Tt turns out that the convenient way to prevent 1240them inlining is to give them a NOINLINE pragma, which we do in 1241StrictAnal.addStrictnessInfoToTopId 1242-} 1243 1244callSiteInline :: DynFlags 1245 -> Id -- The Id 1246 -> Bool -- True <=> unfolding is active 1247 -> Bool -- True if there are no arguments at all (incl type args) 1248 -> [ArgSummary] -- One for each value arg; True if it is interesting 1249 -> CallCtxt -- True <=> continuation is interesting 1250 -> Maybe CoreExpr -- Unfolding, if any 1251 1252data ArgSummary = TrivArg -- Nothing interesting 1253 | NonTrivArg -- Arg has structure 1254 | ValueArg -- Arg is a con-app or PAP 1255 -- ..or con-like. Note [Conlike is interesting] 1256 1257instance Outputable ArgSummary where 1258 ppr TrivArg = text "TrivArg" 1259 ppr NonTrivArg = text "NonTrivArg" 1260 ppr ValueArg = text "ValueArg" 1261 1262nonTriv :: ArgSummary -> Bool 1263nonTriv TrivArg = False 1264nonTriv _ = True 1265 1266data CallCtxt 1267 = BoringCtxt 1268 | RhsCtxt -- Rhs of a let-binding; see Note [RHS of lets] 1269 | DiscArgCtxt -- Argument of a function with non-zero arg discount 1270 | RuleArgCtxt -- We are somewhere in the argument of a function with rules 1271 1272 | ValAppCtxt -- We're applied to at least one value arg 1273 -- This arises when we have ((f x |> co) y) 1274 -- Then the (f x) has argument 'x' but in a ValAppCtxt 1275 1276 | CaseCtxt -- We're the scrutinee of a case 1277 -- that decomposes its scrutinee 1278 1279instance Outputable CallCtxt where 1280 ppr CaseCtxt = text "CaseCtxt" 1281 ppr ValAppCtxt = text "ValAppCtxt" 1282 ppr BoringCtxt = text "BoringCtxt" 1283 ppr RhsCtxt = text "RhsCtxt" 1284 ppr DiscArgCtxt = text "DiscArgCtxt" 1285 ppr RuleArgCtxt = text "RuleArgCtxt" 1286 1287callSiteInline dflags id active_unfolding lone_variable arg_infos cont_info 1288 = case idUnfolding id of 1289 -- idUnfolding checks for loop-breakers, returning NoUnfolding 1290 -- Things with an INLINE pragma may have an unfolding *and* 1291 -- be a loop breaker (maybe the knot is not yet untied) 1292 CoreUnfolding { uf_tmpl = unf_template 1293 , uf_is_work_free = is_wf 1294 , uf_guidance = guidance, uf_expandable = is_exp } 1295 | active_unfolding -> tryUnfolding dflags id lone_variable 1296 arg_infos cont_info unf_template 1297 is_wf is_exp guidance 1298 | otherwise -> traceInline dflags id "Inactive unfolding:" (ppr id) Nothing 1299 NoUnfolding -> Nothing 1300 BootUnfolding -> Nothing 1301 OtherCon {} -> Nothing 1302 DFunUnfolding {} -> Nothing -- Never unfold a DFun 1303 1304traceInline :: DynFlags -> Id -> String -> SDoc -> a -> a 1305traceInline dflags inline_id str doc result 1306 | Just prefix <- inlineCheck dflags 1307 = if prefix `isPrefixOf` occNameString (getOccName inline_id) 1308 then pprTrace str doc result 1309 else result 1310 | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags 1311 = pprTrace str doc result 1312 | otherwise 1313 = result 1314 1315-- | This is an awful but temporary workaround for #17615, where the 1316-- case analysis from the 'ufVeryAggressive' selector causes the entire 1317-- 'DynFlags' to be unpacked into local bindings (due to binder swap). This 1318-- results in a tremendous amount of stack spillage, severely bloating the code 1319-- generated for 'callSiteInline'. 1320-- 1321-- The right solution here is likely to fix binder swap to avoid this terrible 1322-- behavior (since there are likely other instances of this as well) but this 1323-- case was serious enough that it showed up in a CPU profile and consequently 1324-- I wanted to fix it for 8.10. 1325very_aggressive :: DynFlags -> Bool 1326very_aggressive = ufVeryAggressive 1327{-# NOINLINE very_aggressive #-} 1328 1329tryUnfolding :: DynFlags -> Id -> Bool -> [ArgSummary] -> CallCtxt 1330 -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance 1331 -> Maybe CoreExpr 1332tryUnfolding dflags id lone_variable 1333 arg_infos cont_info unf_template 1334 is_wf is_exp guidance 1335 = case guidance of 1336 UnfNever -> traceInline dflags id str (text "UnfNever") Nothing 1337 1338 UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok } 1339 | enough_args && (boring_ok || some_benefit || very_aggressive dflags) 1340 -- See Note [INLINE for small functions (3)] 1341 -> traceInline dflags id str (mk_doc some_benefit empty True) (Just unf_template) 1342 | otherwise 1343 -> traceInline dflags id str (mk_doc some_benefit empty False) Nothing 1344 where 1345 some_benefit = calc_some_benefit uf_arity 1346 enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0) 1347 1348 UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size } 1349 | very_aggressive dflags 1350 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template) 1351 | is_wf && some_benefit && small_enough 1352 -> traceInline dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template) 1353 | otherwise 1354 -> traceInline dflags id str (mk_doc some_benefit extra_doc False) Nothing 1355 where 1356 some_benefit = calc_some_benefit (length arg_discounts) 1357 extra_doc = text "discounted size =" <+> int discounted_size 1358 discounted_size = size - discount 1359 small_enough = discounted_size <= ufUseThreshold dflags 1360 discount = computeDiscount dflags arg_discounts 1361 res_discount arg_infos cont_info 1362 1363 where 1364 mk_doc some_benefit extra_doc yes_or_no 1365 = vcat [ text "arg infos" <+> ppr arg_infos 1366 , text "interesting continuation" <+> ppr cont_info 1367 , text "some_benefit" <+> ppr some_benefit 1368 , text "is exp:" <+> ppr is_exp 1369 , text "is work-free:" <+> ppr is_wf 1370 , text "guidance" <+> ppr guidance 1371 , extra_doc 1372 , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"] 1373 1374 str = "Considering inlining: " ++ showSDocDump dflags (ppr id) 1375 n_val_args = length arg_infos 1376 1377 -- some_benefit is used when the RHS is small enough 1378 -- and the call has enough (or too many) value 1379 -- arguments (ie n_val_args >= arity). But there must 1380 -- be *something* interesting about some argument, or the 1381 -- result context, to make it worth inlining 1382 calc_some_benefit :: Arity -> Bool -- The Arity is the number of args 1383 -- expected by the unfolding 1384 calc_some_benefit uf_arity 1385 | not saturated = interesting_args -- Under-saturated 1386 -- Note [Unsaturated applications] 1387 | otherwise = interesting_args -- Saturated or over-saturated 1388 || interesting_call 1389 where 1390 saturated = n_val_args >= uf_arity 1391 over_saturated = n_val_args > uf_arity 1392 interesting_args = any nonTriv arg_infos 1393 -- NB: (any nonTriv arg_infos) looks at the 1394 -- over-saturated args too which is "wrong"; 1395 -- but if over-saturated we inline anyway. 1396 1397 interesting_call 1398 | over_saturated 1399 = True 1400 | otherwise 1401 = case cont_info of 1402 CaseCtxt -> not (lone_variable && is_exp) -- Note [Lone variables] 1403 ValAppCtxt -> True -- Note [Cast then apply] 1404 RuleArgCtxt -> uf_arity > 0 -- See Note [Unfold info lazy contexts] 1405 DiscArgCtxt -> uf_arity > 0 -- Note [Inlining in ArgCtxt] 1406 RhsCtxt -> uf_arity > 0 -- 1407 _other -> False -- See Note [Nested functions] 1408 1409 1410{- 1411Note [Unfold into lazy contexts], Note [RHS of lets] 1412~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1413When the call is the argument of a function with a RULE, or the RHS of a let, 1414we are a little bit keener to inline. For example 1415 f y = (y,y,y) 1416 g y = let x = f y in ...(case x of (a,b,c) -> ...) ... 1417We'd inline 'f' if the call was in a case context, and it kind-of-is, 1418only we can't see it. Also 1419 x = f v 1420could be expensive whereas 1421 x = case v of (a,b) -> a 1422is patently cheap and may allow more eta expansion. 1423So we treat the RHS of a let as not-totally-boring. 1424 1425Note [Unsaturated applications] 1426~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1427When a call is not saturated, we *still* inline if one of the 1428arguments has interesting structure. That's sometimes very important. 1429A good example is the Ord instance for Bool in Base: 1430 1431 Rec { 1432 $fOrdBool =GHC.Classes.D:Ord 1433 @ Bool 1434 ... 1435 $cmin_ajX 1436 1437 $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool 1438 $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool 1439 } 1440 1441But the defn of GHC.Classes.$dmmin is: 1442 1443 $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a 1444 {- Arity: 3, HasNoCafRefs, Strictness: SLL, 1445 Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a -> 1446 case @ a GHC.Classes.<= @ a $dOrd x y of wild { 1447 GHC.Types.False -> y GHC.Types.True -> x }) -} 1448 1449We *really* want to inline $dmmin, even though it has arity 3, in 1450order to unravel the recursion. 1451 1452 1453Note [Things to watch] 1454~~~~~~~~~~~~~~~~~~~~~~ 1455* { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... } 1456 Assume x is exported, so not inlined unconditionally. 1457 Then we want x to inline unconditionally; no reason for it 1458 not to, and doing so avoids an indirection. 1459 1460* { x = I# 3; ....f x.... } 1461 Make sure that x does not inline unconditionally! 1462 Lest we get extra allocation. 1463 1464Note [Inlining an InlineRule] 1465~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1466An InlineRules is used for 1467 (a) programmer INLINE pragmas 1468 (b) inlinings from worker/wrapper 1469 1470For (a) the RHS may be large, and our contract is that we *only* inline 1471when the function is applied to all the arguments on the LHS of the 1472source-code defn. (The uf_arity in the rule.) 1473 1474However for worker/wrapper it may be worth inlining even if the 1475arity is not satisfied (as we do in the CoreUnfolding case) so we don't 1476require saturation. 1477 1478Note [Nested functions] 1479~~~~~~~~~~~~~~~~~~~~~~~ 1480At one time we treated a call of a non-top-level function as 1481"interesting" (regardless of how boring the context) in the hope 1482that inlining it would eliminate the binding, and its allocation. 1483Specifically, in the default case of interesting_call we had 1484 _other -> not is_top && uf_arity > 0 1485 1486But actually postInlineUnconditionally does some of this and overall 1487it makes virtually no difference to nofib. So I simplified away this 1488special case 1489 1490Note [Cast then apply] 1491~~~~~~~~~~~~~~~~~~~~~~ 1492Consider 1493 myIndex = __inline_me ( (/\a. <blah>) |> co ) 1494 co :: (forall a. a -> a) ~ (forall a. T a) 1495 ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ... 1496 1497We need to inline myIndex to unravel this; but the actual call (myIndex a) has 1498no value arguments. The ValAppCtxt gives it enough incentive to inline. 1499 1500Note [Inlining in ArgCtxt] 1501~~~~~~~~~~~~~~~~~~~~~~~~~~ 1502The condition (arity > 0) here is very important, because otherwise 1503we end up inlining top-level stuff into useless places; eg 1504 x = I# 3# 1505 f = \y. g x 1506This can make a very big difference: it adds 16% to nofib 'integer' allocs, 1507and 20% to 'power'. 1508 1509At one stage I replaced this condition by 'True' (leading to the above 1510slow-down). The motivation was test eyeball/inline1.hs; but that seems 1511to work ok now. 1512 1513NOTE: arguably, we should inline in ArgCtxt only if the result of the 1514call is at least CONLIKE. At least for the cases where we use ArgCtxt 1515for the RHS of a 'let', we only profit from the inlining if we get a 1516CONLIKE thing (modulo lets). 1517 1518Note [Lone variables] See also Note [Interaction of exprIsWorkFree and lone variables] 1519~~~~~~~~~~~~~~~~~~~~~ which appears below 1520The "lone-variable" case is important. I spent ages messing about 1521with unsatisfactory variants, but this is nice. The idea is that if a 1522variable appears all alone 1523 1524 as an arg of lazy fn, or rhs BoringCtxt 1525 as scrutinee of a case CaseCtxt 1526 as arg of a fn ArgCtxt 1527AND 1528 it is bound to a cheap expression 1529 1530then we should not inline it (unless there is some other reason, 1531e.g. it is the sole occurrence). That is what is happening at 1532the use of 'lone_variable' in 'interesting_call'. 1533 1534Why? At least in the case-scrutinee situation, turning 1535 let x = (a,b) in case x of y -> ... 1536into 1537 let x = (a,b) in case (a,b) of y -> ... 1538and thence to 1539 let x = (a,b) in let y = (a,b) in ... 1540is bad if the binding for x will remain. 1541 1542Another example: I discovered that strings 1543were getting inlined straight back into applications of 'error' 1544because the latter is strict. 1545 s = "foo" 1546 f = \x -> ...(error s)... 1547 1548Fundamentally such contexts should not encourage inlining because, provided 1549the RHS is "expandable" (see Note [exprIsExpandable] in CoreUtils) the 1550context can ``see'' the unfolding of the variable (e.g. case or a 1551RULE) so there's no gain. 1552 1553However, watch out: 1554 1555 * Consider this: 1556 foo = _inline_ (\n. [n]) 1557 bar = _inline_ (foo 20) 1558 baz = \n. case bar of { (m:_) -> m + n } 1559 Here we really want to inline 'bar' so that we can inline 'foo' 1560 and the whole thing unravels as it should obviously do. This is 1561 important: in the NDP project, 'bar' generates a closure data 1562 structure rather than a list. 1563 1564 So the non-inlining of lone_variables should only apply if the 1565 unfolding is regarded as cheap; because that is when exprIsConApp_maybe 1566 looks through the unfolding. Hence the "&& is_wf" in the 1567 InlineRule branch. 1568 1569 * Even a type application or coercion isn't a lone variable. 1570 Consider 1571 case $fMonadST @ RealWorld of { :DMonad a b c -> c } 1572 We had better inline that sucker! The case won't see through it. 1573 1574 For now, I'm treating treating a variable applied to types 1575 in a *lazy* context "lone". The motivating example was 1576 f = /\a. \x. BIG 1577 g = /\a. \y. h (f a) 1578 There's no advantage in inlining f here, and perhaps 1579 a significant disadvantage. Hence some_val_args in the Stop case 1580 1581Note [Interaction of exprIsWorkFree and lone variables] 1582~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1583The lone-variable test says "don't inline if a case expression 1584scrutinises a lone variable whose unfolding is cheap". It's very 1585important that, under these circumstances, exprIsConApp_maybe 1586can spot a constructor application. So, for example, we don't 1587consider 1588 let x = e in (x,x) 1589to be cheap, and that's good because exprIsConApp_maybe doesn't 1590think that expression is a constructor application. 1591 1592In the 'not (lone_variable && is_wf)' test, I used to test is_value 1593rather than is_wf, which was utterly wrong, because the above 1594expression responds True to exprIsHNF, which is what sets is_value. 1595 1596This kind of thing can occur if you have 1597 1598 {-# INLINE foo #-} 1599 foo = let x = e in (x,x) 1600 1601which Roman did. 1602 1603 1604-} 1605 1606computeDiscount :: DynFlags -> [Int] -> Int -> [ArgSummary] -> CallCtxt 1607 -> Int 1608computeDiscount dflags arg_discounts res_discount arg_infos cont_info 1609 -- We multiple the raw discounts (args_discount and result_discount) 1610 -- ty opt_UnfoldingKeenessFactor because the former have to do with 1611 -- *size* whereas the discounts imply that there's some extra 1612 -- *efficiency* to be gained (e.g. beta reductions, case reductions) 1613 -- by inlining. 1614 1615 = 10 -- Discount of 10 because the result replaces the call 1616 -- so we count 10 for the function itself 1617 1618 + 10 * length actual_arg_discounts 1619 -- Discount of 10 for each arg supplied, 1620 -- because the result replaces the call 1621 1622 + round (ufKeenessFactor dflags * 1623 fromIntegral (total_arg_discount + res_discount')) 1624 where 1625 actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos 1626 total_arg_discount = sum actual_arg_discounts 1627 1628 mk_arg_discount _ TrivArg = 0 1629 mk_arg_discount _ NonTrivArg = 10 1630 mk_arg_discount discount ValueArg = discount 1631 1632 res_discount' 1633 | LT <- arg_discounts `compareLength` arg_infos 1634 = res_discount -- Over-saturated 1635 | otherwise 1636 = case cont_info of 1637 BoringCtxt -> 0 1638 CaseCtxt -> res_discount -- Presumably a constructor 1639 ValAppCtxt -> res_discount -- Presumably a function 1640 _ -> 40 `min` res_discount 1641 -- ToDo: this 40 `min` res_discount doesn't seem right 1642 -- for DiscArgCtxt it shouldn't matter because the function will 1643 -- get the arg discount for any non-triv arg 1644 -- for RuleArgCtxt we do want to be keener to inline; but not only 1645 -- constructor results 1646 -- for RhsCtxt I suppose that exposing a data con is good in general 1647 -- And 40 seems very arbitrary 1648 -- 1649 -- res_discount can be very large when a function returns 1650 -- constructors; but we only want to invoke that large discount 1651 -- when there's a case continuation. 1652 -- Otherwise we, rather arbitrarily, threshold it. Yuk. 1653 -- But we want to aovid inlining large functions that return 1654 -- constructors into contexts that are simply "interesting" 1655