1{- 2(c) The University of Glasgow 2006 3(c) The AQUA Project, Glasgow University, 1994-1998 4 5 6Core-syntax unfoldings 7 8Unfoldings (which can travel across module boundaries) are in Core 9syntax (namely @CoreExpr@s). 10 11The type @Unfolding@ sits ``above'' simply-Core-expressions 12unfoldings, capturing ``higher-level'' things we know about a binding, 13usually things that the simplifier found out (e.g., ``it's a 14literal''). In the corner of a @CoreUnfolding@ unfolding, you will 15find, unsurprisingly, a Core expression. 16-} 17 18{-# LANGUAGE CPP #-} 19{-# LANGUAGE BangPatterns #-} 20 21{-# OPTIONS_GHC -Wno-incomplete-record-updates #-} 22 23module GHC.Core.Unfold ( 24 Unfolding, UnfoldingGuidance, -- Abstract types 25 26 UnfoldingOpts (..), defaultUnfoldingOpts, 27 updateCreationThreshold, updateUseThreshold, 28 updateFunAppDiscount, updateDictDiscount, 29 updateVeryAggressive, updateCaseScaling, updateCaseThreshold, 30 31 ArgSummary(..), 32 33 couldBeSmallEnoughToInline, inlineBoringOk, 34 certainlyWillInline, smallEnoughToInline, 35 36 callSiteInline, CallCtxt(..), 37 calcUnfoldingGuidance 38 ) where 39 40#include "GhclibHsVersions.h" 41 42import GHC.Prelude 43 44import GHC.Driver.Session 45import GHC.Driver.Ppr 46import GHC.Core 47import GHC.Core.Utils 48import GHC.Types.Id 49import GHC.Types.Demand ( isDeadEndSig ) 50import GHC.Core.DataCon 51import GHC.Types.Literal 52import GHC.Builtin.PrimOps 53import GHC.Types.Id.Info 54import GHC.Types.Basic ( Arity, InlineSpec(..), inlinePragmaSpec ) 55import GHC.Core.Type 56import GHC.Builtin.Names 57import GHC.Builtin.Types.Prim ( realWorldStatePrimTy ) 58import GHC.Data.Bag 59import GHC.Utils.Logger 60import GHC.Utils.Misc 61import GHC.Utils.Outputable 62import GHC.Types.ForeignCall 63import GHC.Types.Name 64import GHC.Types.Tickish 65 66import qualified Data.ByteString as BS 67import Data.List (isPrefixOf) 68 69 70-- | Unfolding options 71data UnfoldingOpts = UnfoldingOpts 72 { unfoldingCreationThreshold :: !Int 73 -- ^ Threshold above which unfoldings are not *created* 74 75 , unfoldingUseThreshold :: !Int 76 -- ^ Threshold above which unfoldings are not *inlined* 77 78 , unfoldingFunAppDiscount :: !Int 79 -- ^ Discount for lambdas that are used (applied) 80 81 , unfoldingDictDiscount :: !Int 82 -- ^ Discount for dictionaries 83 84 , unfoldingVeryAggressive :: !Bool 85 -- ^ Force inlining in many more cases 86 87 -- Don't consider depth up to x 88 , unfoldingCaseThreshold :: !Int 89 90 -- Penalize depth with 1/x 91 , unfoldingCaseScaling :: !Int 92 } 93 94defaultUnfoldingOpts :: UnfoldingOpts 95defaultUnfoldingOpts = UnfoldingOpts 96 { unfoldingCreationThreshold = 750 97 -- The unfoldingCreationThreshold threshold must be reasonably high 98 -- to take account of possible discounts. 99 -- E.g. 450 is not enough in 'fulsom' for Interval.sqr to 100 -- inline into Csg.calc (The unfolding for sqr never makes it 101 -- into the interface file.) 102 103 , unfoldingUseThreshold = 90 104 -- Last adjusted upwards in #18282, when I reduced 105 -- the result discount for constructors. 106 107 , unfoldingFunAppDiscount = 60 108 -- Be fairly keen to inline a function if that means 109 -- we'll be able to pick the right method from a dictionary 110 111 , unfoldingDictDiscount = 30 112 -- Be fairly keen to inline a function if that means 113 -- we'll be able to pick the right method from a dictionary 114 115 , unfoldingVeryAggressive = False 116 117 -- Only apply scaling once we are deeper than threshold cases 118 -- in an RHS. 119 , unfoldingCaseThreshold = 2 120 121 -- Penalize depth with (size*depth)/scaling 122 , unfoldingCaseScaling = 30 123 } 124 125-- Helpers for "GHC.Driver.Session" 126 127updateCreationThreshold :: Int -> UnfoldingOpts -> UnfoldingOpts 128updateCreationThreshold n opts = opts { unfoldingCreationThreshold = n } 129 130updateUseThreshold :: Int -> UnfoldingOpts -> UnfoldingOpts 131updateUseThreshold n opts = opts { unfoldingUseThreshold = n } 132 133updateFunAppDiscount :: Int -> UnfoldingOpts -> UnfoldingOpts 134updateFunAppDiscount n opts = opts { unfoldingFunAppDiscount = n } 135 136updateDictDiscount :: Int -> UnfoldingOpts -> UnfoldingOpts 137updateDictDiscount n opts = opts { unfoldingDictDiscount = n } 138 139updateVeryAggressive :: Bool -> UnfoldingOpts -> UnfoldingOpts 140updateVeryAggressive n opts = opts { unfoldingVeryAggressive = n } 141 142 143updateCaseThreshold :: Int -> UnfoldingOpts -> UnfoldingOpts 144updateCaseThreshold n opts = opts { unfoldingCaseThreshold = n } 145 146updateCaseScaling :: Int -> UnfoldingOpts -> UnfoldingOpts 147updateCaseScaling n opts = opts { unfoldingCaseScaling = n } 148 149{- 150Note [Occurrence analysis of unfoldings] 151~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 152We do occurrence-analysis of unfoldings once and for all, when the 153unfolding is built, rather than each time we inline them. 154 155But given this decision it's vital that we do 156*always* do it. Consider this unfolding 157 \x -> letrec { f = ...g...; g* = f } in body 158where g* is (for some strange reason) the loop breaker. If we don't 159occ-anal it when reading it in, we won't mark g as a loop breaker, and 160we may inline g entirely in body, dropping its binding, and leaving 161the occurrence in f out of scope. This happened in #8892, where 162the unfolding in question was a DFun unfolding. 163 164But more generally, the simplifier is designed on the 165basis that it is looking at occurrence-analysed expressions, so better 166ensure that they actually are. 167 168Note [Calculate unfolding guidance on the non-occ-anal'd expression] 169~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 170Notice that we give the non-occur-analysed expression to 171calcUnfoldingGuidance. In some ways it'd be better to occur-analyse 172first; for example, sometimes during simplification, there's a large 173let-bound thing which has been substituted, and so is now dead; so 174'expr' contains two copies of the thing while the occurrence-analysed 175expression doesn't. 176 177Nevertheless, we *don't* and *must not* occ-analyse before computing 178the size because 179 180a) The size computation bales out after a while, whereas occurrence 181 analysis does not. 182 183b) Residency increases sharply if you occ-anal first. I'm not 184 100% sure why, but it's a large effect. Compiling Cabal went 185 from residency of 534M to over 800M with this one change. 186 187This can occasionally mean that the guidance is very pessimistic; 188it gets fixed up next round. And it should be rare, because large 189let-bound things that are dead are usually caught by preInlineUnconditionally 190 191 192************************************************************************ 193* * 194\subsection{The UnfoldingGuidance type} 195* * 196************************************************************************ 197-} 198 199inlineBoringOk :: CoreExpr -> Bool 200-- See Note [INLINE for small functions] 201-- True => the result of inlining the expression is 202-- no bigger than the expression itself 203-- eg (\x y -> f y x) 204-- This is a quick and dirty version. It doesn't attempt 205-- to deal with (\x y z -> x (y z)) 206-- The really important one is (x `cast` c) 207inlineBoringOk e 208 = go 0 e 209 where 210 go :: Int -> CoreExpr -> Bool 211 go credit (Lam x e) | isId x = go (credit+1) e 212 | otherwise = go credit e 213 -- See Note [Count coercion arguments in boring contexts] 214 go credit (App f (Type {})) = go credit f 215 go credit (App f a) | credit > 0 216 , exprIsTrivial a = go (credit-1) f 217 go credit (Tick _ e) = go credit e -- dubious 218 go credit (Cast e _) = go credit e 219 go credit (Case scrut _ _ [Alt _ _ rhs]) -- See Note [Inline unsafeCoerce] 220 | isUnsafeEqualityProof scrut = go credit rhs 221 go _ (Var {}) = boringCxtOk 222 go _ _ = boringCxtNotOk 223 224calcUnfoldingGuidance 225 :: UnfoldingOpts 226 -> Bool -- Definitely a top-level, bottoming binding 227 -> CoreExpr -- Expression to look at 228 -> UnfoldingGuidance 229calcUnfoldingGuidance opts is_top_bottoming (Tick t expr) 230 | not (tickishIsCode t) -- non-code ticks don't matter for unfolding 231 = calcUnfoldingGuidance opts is_top_bottoming expr 232calcUnfoldingGuidance opts is_top_bottoming expr 233 = case sizeExpr opts bOMB_OUT_SIZE val_bndrs body of 234 TooBig -> UnfNever 235 SizeIs size cased_bndrs scrut_discount 236 | uncondInline expr n_val_bndrs size 237 -> UnfWhen { ug_unsat_ok = unSaturatedOk 238 , ug_boring_ok = boringCxtOk 239 , ug_arity = n_val_bndrs } -- Note [INLINE for small functions] 240 241 | is_top_bottoming 242 -> UnfNever -- See Note [Do not inline top-level bottoming functions] 243 244 | otherwise 245 -> UnfIfGoodArgs { ug_args = map (mk_discount cased_bndrs) val_bndrs 246 , ug_size = size 247 , ug_res = scrut_discount } 248 249 where 250 (bndrs, body) = collectBinders expr 251 bOMB_OUT_SIZE = unfoldingCreationThreshold opts 252 -- Bomb out if size gets bigger than this 253 val_bndrs = filter isId bndrs 254 n_val_bndrs = length val_bndrs 255 256 mk_discount :: Bag (Id,Int) -> Id -> Int 257 mk_discount cbs bndr = foldl' combine 0 cbs 258 where 259 combine acc (bndr', disc) 260 | bndr == bndr' = acc `plus_disc` disc 261 | otherwise = acc 262 263 plus_disc :: Int -> Int -> Int 264 plus_disc | isFunTy (idType bndr) = max 265 | otherwise = (+) 266 -- See Note [Function and non-function discounts] 267 268{- Note [Inline unsafeCoerce] 269~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 270We really want to inline unsafeCoerce, even when applied to boring 271arguments. It doesn't look as if its RHS is smaller than the call 272 unsafeCoerce x = case unsafeEqualityProof @a @b of UnsafeRefl -> x 273but that case is discarded -- see Note [Implementing unsafeCoerce] 274in base:Unsafe.Coerce. 275 276Moreover, if we /don't/ inline it, we may be left with 277 f (unsafeCoerce x) 278which will build a thunk -- bad, bad, bad. 279 280Conclusion: we really want inlineBoringOk to be True of the RHS of 281unsafeCoerce. This is (U4) in Note [Implementing unsafeCoerce]. 282 283Note [Computing the size of an expression] 284~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 285The basic idea of sizeExpr is obvious enough: count nodes. But getting the 286heuristics right has taken a long time. Here's the basic strategy: 287 288 * Variables, literals: 0 289 (Exception for string literals, see litSize.) 290 291 * Function applications (f e1 .. en): 1 + #value args 292 293 * Constructor applications: 1, regardless of #args 294 295 * Let(rec): 1 + size of components 296 297 * Note, cast: 0 298 299Examples 300 301 Size Term 302 -------------- 303 0 42# 304 0 x 305 0 True 306 2 f x 307 1 Just x 308 4 f (g x) 309 310Notice that 'x' counts 0, while (f x) counts 2. That's deliberate: there's 311a function call to account for. Notice also that constructor applications 312are very cheap, because exposing them to a caller is so valuable. 313 314[25/5/11] All sizes are now multiplied by 10, except for primops 315(which have sizes like 1 or 4. This makes primops look fantastically 316cheap, and seems to be almost universally beneficial. Done partly as a 317result of #4978. 318 319Note [Do not inline top-level bottoming functions] 320~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 321The FloatOut pass has gone to some trouble to float out calls to 'error' 322and similar friends. See Note [Bottoming floats] in GHC.Core.Opt.SetLevels. 323Do not re-inline them! But we *do* still inline if they are very small 324(the uncondInline stuff). 325 326Note [INLINE for small functions] 327~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 328Consider {-# INLINE f #-} 329 f x = Just x 330 g y = f y 331Then f's RHS is no larger than its LHS, so we should inline it into 332even the most boring context. In general, f the function is 333sufficiently small that its body is as small as the call itself, the 334inline unconditionally, regardless of how boring the context is. 335 336Things to note: 337 338(1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr) 339 than the thing it's replacing. Notice that 340 (f x) --> (g 3) -- YES, unconditionally 341 (f x) --> x : [] -- YES, *even though* there are two 342 -- arguments to the cons 343 x --> g 3 -- NO 344 x --> Just v -- NO 345 346 It's very important not to unconditionally replace a variable by 347 a non-atomic term. 348 349(2) We do this even if the thing isn't saturated, else we end up with the 350 silly situation that 351 f x y = x 352 ...map (f 3)... 353 doesn't inline. Even in a boring context, inlining without being 354 saturated will give a lambda instead of a PAP, and will be more 355 efficient at runtime. 356 357(3) However, when the function's arity > 0, we do insist that it 358 has at least one value argument at the call site. (This check is 359 made in the UnfWhen case of callSiteInline.) Otherwise we find this: 360 f = /\a \x:a. x 361 d = /\b. MkD (f b) 362 If we inline f here we get 363 d = /\b. MkD (\x:b. x) 364 and then prepareRhs floats out the argument, abstracting the type 365 variables, so we end up with the original again! 366 367(4) We must be much more cautious about arity-zero things. Consider 368 let x = y +# z in ... 369 In *size* terms primops look very small, because the generate a 370 single instruction, but we do not want to unconditionally replace 371 every occurrence of x with (y +# z). So we only do the 372 unconditional-inline thing for *trivial* expressions. 373 374 NB: you might think that PostInlineUnconditionally would do this 375 but it doesn't fire for top-level things; see GHC.Core.Opt.Simplify.Utils 376 Note [Top level and postInlineUnconditionally] 377 378Note [Count coercion arguments in boring contexts] 379~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 380In inlineBoringOK, we ignore type arguments when deciding whether an 381expression is okay to inline into boring contexts. This is good, since 382if we have a definition like 383 384 let y = x @Int in f y y 385 386there’s no reason not to inline y at both use sites — no work is 387actually duplicated. It may seem like the same reasoning applies to 388coercion arguments, and indeed, in #17182 we changed inlineBoringOK to 389treat coercions the same way. 390 391However, this isn’t a good idea: unlike type arguments, which have 392no runtime representation, coercion arguments *do* have a runtime 393representation (albeit the zero-width VoidRep, see Note [Coercion tokens] 394in "GHC.CoreToStg"). This caused trouble in #17787 for DataCon wrappers for 395nullary GADT constructors: the wrappers would be inlined and each use of 396the constructor would lead to a separate allocation instead of just 397sharing the wrapper closure. 398 399The solution: don’t ignore coercion arguments after all. 400-} 401 402uncondInline :: CoreExpr -> Arity -> Int -> Bool 403-- Inline unconditionally if there no size increase 404-- Size of call is arity (+1 for the function) 405-- See Note [INLINE for small functions] 406uncondInline rhs arity size 407 | arity > 0 = size <= 10 * (arity + 1) -- See Note [INLINE for small functions] (1) 408 | otherwise = exprIsTrivial rhs -- See Note [INLINE for small functions] (4) 409 410sizeExpr :: UnfoldingOpts 411 -> Int -- Bomb out if it gets bigger than this 412 -> [Id] -- Arguments; we're interested in which of these 413 -- get case'd 414 -> CoreExpr 415 -> ExprSize 416 417-- Note [Computing the size of an expression] 418 419-- Forcing bOMB_OUT_SIZE early prevents repeated 420-- unboxing of the Int argument. 421sizeExpr opts !bOMB_OUT_SIZE top_args expr 422 = size_up expr 423 where 424 size_up (Cast e _) = size_up e 425 size_up (Tick _ e) = size_up e 426 size_up (Type _) = sizeZero -- Types cost nothing 427 size_up (Coercion _) = sizeZero 428 size_up (Lit lit) = sizeN (litSize lit) 429 size_up (Var f) | isRealWorldId f = sizeZero 430 -- Make sure we get constructor discounts even 431 -- on nullary constructors 432 | otherwise = size_up_call f [] 0 433 434 size_up (App fun arg) 435 | isTyCoArg arg = size_up fun 436 | otherwise = size_up arg `addSizeNSD` 437 size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0) 438 439 size_up (Lam b e) 440 | isId b && not (isRealWorldId b) = lamScrutDiscount opts (size_up e `addSizeN` 10) 441 | otherwise = size_up e 442 443 size_up (Let (NonRec binder rhs) body) 444 = size_up_rhs (binder, rhs) `addSizeNSD` 445 size_up body `addSizeN` 446 size_up_alloc binder 447 448 size_up (Let (Rec pairs) body) 449 = foldr (addSizeNSD . size_up_rhs) 450 (size_up body `addSizeN` sum (map (size_up_alloc . fst) pairs)) 451 pairs 452 453 size_up (Case e _ _ alts) 454 | null alts 455 = size_up e -- case e of {} never returns, so take size of scrutinee 456 457 size_up (Case e _ _ alts) 458 -- Now alts is non-empty 459 | Just v <- is_top_arg e -- We are scrutinising an argument variable 460 = let 461 alt_sizes = map size_up_alt alts 462 463 -- alts_size tries to compute a good discount for 464 -- the case when we are scrutinising an argument variable 465 alts_size (SizeIs tot tot_disc tot_scrut) 466 -- Size of all alternatives 467 (SizeIs max _ _) 468 -- Size of biggest alternative 469 = SizeIs tot (unitBag (v, 20 + tot - max) 470 `unionBags` tot_disc) tot_scrut 471 -- If the variable is known, we produce a 472 -- discount that will take us back to 'max', 473 -- the size of the largest alternative The 474 -- 1+ is a little discount for reduced 475 -- allocation in the caller 476 -- 477 -- Notice though, that we return tot_disc, 478 -- the total discount from all branches. I 479 -- think that's right. 480 481 alts_size tot_size _ = tot_size 482 in 483 alts_size (foldr1 addAltSize alt_sizes) -- alts is non-empty 484 (foldr1 maxSize alt_sizes) 485 -- Good to inline if an arg is scrutinised, because 486 -- that may eliminate allocation in the caller 487 -- And it eliminates the case itself 488 where 489 is_top_arg (Var v) | v `elem` top_args = Just v 490 is_top_arg (Cast e _) = is_top_arg e 491 is_top_arg _ = Nothing 492 493 494 size_up (Case e _ _ alts) = size_up e `addSizeNSD` 495 foldr (addAltSize . size_up_alt) case_size alts 496 where 497 case_size 498 | is_inline_scrut e, lengthAtMost alts 1 = sizeN (-10) 499 | otherwise = sizeZero 500 -- Normally we don't charge for the case itself, but 501 -- we charge one per alternative (see size_up_alt, 502 -- below) to account for the cost of the info table 503 -- and comparisons. 504 -- 505 -- However, in certain cases (see is_inline_scrut 506 -- below), no code is generated for the case unless 507 -- there are multiple alts. In these cases we 508 -- subtract one, making the first alt free. 509 -- e.g. case x# +# y# of _ -> ... should cost 1 510 -- case touch# x# of _ -> ... should cost 0 511 -- (see #4978) 512 -- 513 -- I would like to not have the "lengthAtMost alts 1" 514 -- condition above, but without that some programs got worse 515 -- (spectral/hartel/event and spectral/para). I don't fully 516 -- understand why. (SDM 24/5/11) 517 518 -- unboxed variables, inline primops and unsafe foreign calls 519 -- are all "inline" things: 520 is_inline_scrut (Var v) = isUnliftedType (idType v) 521 is_inline_scrut scrut 522 | (Var f, _) <- collectArgs scrut 523 = case idDetails f of 524 FCallId fc -> not (isSafeForeignCall fc) 525 PrimOpId op -> not (primOpOutOfLine op) 526 _other -> False 527 | otherwise 528 = False 529 530 size_up_rhs (bndr, rhs) 531 | Just join_arity <- isJoinId_maybe bndr 532 -- Skip arguments to join point 533 , (_bndrs, body) <- collectNBinders join_arity rhs 534 = size_up body 535 | otherwise 536 = size_up rhs 537 538 ------------ 539 -- size_up_app is used when there's ONE OR MORE value args 540 size_up_app (App fun arg) args voids 541 | isTyCoArg arg = size_up_app fun args voids 542 | isRealWorldExpr arg = size_up_app fun (arg:args) (voids + 1) 543 | otherwise = size_up arg `addSizeNSD` 544 size_up_app fun (arg:args) voids 545 size_up_app (Var fun) args voids = size_up_call fun args voids 546 size_up_app (Tick _ expr) args voids = size_up_app expr args voids 547 size_up_app (Cast expr _) args voids = size_up_app expr args voids 548 size_up_app other args voids = size_up other `addSizeN` 549 callSize (length args) voids 550 -- if the lhs is not an App or a Var, or an invisible thing like a 551 -- Tick or Cast, then we should charge for a complete call plus the 552 -- size of the lhs itself. 553 554 ------------ 555 size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize 556 size_up_call fun val_args voids 557 = case idDetails fun of 558 FCallId _ -> sizeN (callSize (length val_args) voids) 559 DataConWorkId dc -> conSize dc (length val_args) 560 PrimOpId op -> primOpSize op (length val_args) 561 ClassOpId _ -> classOpSize opts top_args val_args 562 _ -> funSize opts top_args fun (length val_args) voids 563 564 ------------ 565 size_up_alt (Alt _con _bndrs rhs) = size_up rhs `addSizeN` 10 566 -- Don't charge for args, so that wrappers look cheap 567 -- (See comments about wrappers with Case) 568 -- 569 -- IMPORTANT: *do* charge 1 for the alternative, else we 570 -- find that giant case nests are treated as practically free 571 -- A good example is Foreign.C.Error.errnoToIOError 572 573 ------------ 574 -- Cost to allocate binding with given binder 575 size_up_alloc bndr 576 | isTyVar bndr -- Doesn't exist at runtime 577 || isJoinId bndr -- Not allocated at all 578 || isUnliftedType (idType bndr) -- Doesn't live in heap 579 = 0 580 | otherwise 581 = 10 582 583 ------------ 584 -- These addSize things have to be here because 585 -- I don't want to give them bOMB_OUT_SIZE as an argument 586 addSizeN TooBig _ = TooBig 587 addSizeN (SizeIs n xs d) m = mkSizeIs bOMB_OUT_SIZE (n + m) xs d 588 589 -- addAltSize is used to add the sizes of case alternatives 590 addAltSize TooBig _ = TooBig 591 addAltSize _ TooBig = TooBig 592 addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2) 593 = mkSizeIs bOMB_OUT_SIZE (n1 + n2) 594 (xs `unionBags` ys) 595 (d1 + d2) -- Note [addAltSize result discounts] 596 597 -- This variant ignores the result discount from its LEFT argument 598 -- It's used when the second argument isn't part of the result 599 addSizeNSD TooBig _ = TooBig 600 addSizeNSD _ TooBig = TooBig 601 addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2) 602 = mkSizeIs bOMB_OUT_SIZE (n1 + n2) 603 (xs `unionBags` ys) 604 d2 -- Ignore d1 605 606 isRealWorldId id = idType id `eqType` realWorldStatePrimTy 607 608 -- an expression of type State# RealWorld must be a variable 609 isRealWorldExpr (Var id) = isRealWorldId id 610 isRealWorldExpr (Tick _ e) = isRealWorldExpr e 611 isRealWorldExpr _ = False 612 613-- | Finds a nominal size of a string literal. 614litSize :: Literal -> Int 615-- Used by GHC.Core.Unfold.sizeExpr 616litSize (LitNumber LitNumInteger _) = 100 -- Note [Size of literal integers] 617litSize (LitNumber LitNumNatural _) = 100 618litSize (LitString str) = 10 + 10 * ((BS.length str + 3) `div` 4) 619 -- If size could be 0 then @f "x"@ might be too small 620 -- [Sept03: make literal strings a bit bigger to avoid fruitless 621 -- duplication of little strings] 622litSize _other = 0 -- Must match size of nullary constructors 623 -- Key point: if x |-> 4, then x must inline unconditionally 624 -- (eg via case binding) 625 626classOpSize :: UnfoldingOpts -> [Id] -> [CoreExpr] -> ExprSize 627-- See Note [Conlike is interesting] 628classOpSize _ _ [] 629 = sizeZero 630classOpSize opts top_args (arg1 : other_args) 631 = SizeIs size arg_discount 0 632 where 633 size = 20 + (10 * length other_args) 634 -- If the class op is scrutinising a lambda bound dictionary then 635 -- give it a discount, to encourage the inlining of this function 636 -- The actual discount is rather arbitrarily chosen 637 arg_discount = case arg1 of 638 Var dict | dict `elem` top_args 639 -> unitBag (dict, unfoldingDictDiscount opts) 640 _other -> emptyBag 641 642-- | The size of a function call 643callSize 644 :: Int -- ^ number of value args 645 -> Int -- ^ number of value args that are void 646 -> Int 647callSize n_val_args voids = 10 * (1 + n_val_args - voids) 648 -- The 1+ is for the function itself 649 -- Add 1 for each non-trivial arg; 650 -- the allocation cost, as in let(rec) 651 652-- | The size of a jump to a join point 653jumpSize 654 :: Int -- ^ number of value args 655 -> Int -- ^ number of value args that are void 656 -> Int 657jumpSize n_val_args voids = 2 * (1 + n_val_args - voids) 658 -- A jump is 20% the size of a function call. Making jumps free reopens 659 -- bug #6048, but making them any more expensive loses a 21% improvement in 660 -- spectral/puzzle. TODO Perhaps adjusting the default threshold would be a 661 -- better solution? 662 663funSize :: UnfoldingOpts -> [Id] -> Id -> Int -> Int -> ExprSize 664-- Size for functions that are not constructors or primops 665-- Note [Function applications] 666funSize opts top_args fun n_val_args voids 667 | fun `hasKey` buildIdKey = buildSize 668 | fun `hasKey` augmentIdKey = augmentSize 669 | otherwise = SizeIs size arg_discount res_discount 670 where 671 some_val_args = n_val_args > 0 672 is_join = isJoinId fun 673 674 size | is_join = jumpSize n_val_args voids 675 | not some_val_args = 0 676 | otherwise = callSize n_val_args voids 677 678 -- DISCOUNTS 679 -- See Note [Function and non-function discounts] 680 arg_discount | some_val_args && fun `elem` top_args 681 = unitBag (fun, unfoldingFunAppDiscount opts) 682 | otherwise = emptyBag 683 -- If the function is an argument and is applied 684 -- to some values, give it an arg-discount 685 686 res_discount | idArity fun > n_val_args = unfoldingFunAppDiscount opts 687 | otherwise = 0 688 -- If the function is partially applied, show a result discount 689-- XXX maybe behave like ConSize for eval'd variable 690 691conSize :: DataCon -> Int -> ExprSize 692conSize dc n_val_args 693 | n_val_args == 0 = SizeIs 0 emptyBag 10 -- Like variables 694 695-- See Note [Unboxed tuple size and result discount] 696 | isUnboxedTupleDataCon dc = SizeIs 0 emptyBag 10 697 698-- See Note [Constructor size and result discount] 699 | otherwise = SizeIs 10 emptyBag 10 700 701{- Note [Constructor size and result discount] 702~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 703Treat a constructors application as size 10, regardless of how many 704arguments it has; we are keen to expose them (and we charge separately 705for their args). We can't treat them as size zero, else we find that 706(Just x) has size 0, which is the same as a lone variable; and hence 707'v' will always be replaced by (Just x), where v is bound to Just x. 708 709The "result discount" is applied if the result of the call is 710scrutinised (say by a case). For a constructor application that will 711mean the constructor application will disappear, so we don't need to 712charge it to the function. So the discount should at least match the 713cost of the constructor application, namely 10. 714 715Historical note 1: Until Jun 2020 we gave it a "bit of extra 716incentive" via a discount of 10*(1 + n_val_args), but that was FAR too 717much (#18282). In particular, consider a huge case tree like 718 719 let r = case y1 of 720 Nothing -> B1 a b c 721 Just v1 -> case y2 of 722 Nothing -> B1 c b a 723 Just v2 -> ... 724 725If conSize gives a cost of 10 (regardless of n_val_args) and a 726discount of 10, that'll make each alternative RHS cost zero. We 727charge 10 for each case alternative (see size_up_alt). If we give a 728bigger discount (say 20) in conSize, we'll make the case expression 729cost *nothing*, and that can make a huge case tree cost nothing. This 730leads to massive, sometimes exponenial inlinings (#18282). In short, 731don't give a discount that give a negative size to a sub-expression! 732 733Historical note 2: Much longer ago, Simon M tried a MUCH bigger 734discount: (10 * (10 + n_val_args)), and said it was an "unambiguous 735win", but its terribly dangerous because a function with many many 736case branches, each finishing with a constructor, can have an 737arbitrarily large discount. This led to terrible code bloat: see #6099. 738 739Note [Unboxed tuple size and result discount] 740~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 741However, unboxed tuples count as size zero. I found occasions where we had 742 f x y z = case op# x y z of { s -> (# s, () #) } 743and f wasn't getting inlined. 744 745I tried giving unboxed tuples a *result discount* of zero (see the 746commented-out line). Why? When returned as a result they do not 747allocate, so maybe we don't want to charge so much for them. If you 748have a non-zero discount here, we find that workers often get inlined 749back into wrappers, because it look like 750 f x = case $wf x of (# a,b #) -> (a,b) 751and we are keener because of the case. However while this change 752shrank binary sizes by 0.5% it also made spectral/boyer allocate 5% 753more. All other changes were very small. So it's not a big deal but I 754didn't adopt the idea. 755 756When fixing #18282 (see Note [Constructor size and result discount]) 757I changed the result discount to be just 10, not 10*(1+n_val_args). 758 759Note [Function and non-function discounts] 760~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 761We want a discount if the function is applied. A good example is 762monadic combinators with continuation arguments, where inlining is 763quite important. 764 765But we don't want a big discount when a function is called many times 766(see the detailed comments with #6048) because if the function is 767big it won't be inlined at its many call sites and no benefit results. 768Indeed, we can get exponentially big inlinings this way; that is what 769#6048 is about. 770 771On the other hand, for data-valued arguments, if there are lots of 772case expressions in the body, each one will get smaller if we apply 773the function to a constructor application, so we *want* a big discount 774if the argument is scrutinised by many case expressions. 775 776Conclusion: 777 - For functions, take the max of the discounts 778 - For data values, take the sum of the discounts 779 780 781Note [Literal integer size] 782~~~~~~~~~~~~~~~~~~~~~~~~~~~ 783Literal integers *can* be big (mkInteger [...coefficients...]), but 784need not be (IS n). We just use an arbitrary big-ish constant here 785so that, in particular, we don't inline top-level defns like 786 n = IS 5 787There's no point in doing so -- any optimisations will see the IS 788through n's unfolding. Nor will a big size inhibit unfoldings functions 789that mention a literal Integer, because the float-out pass will float 790all those constants to top level. 791-} 792 793primOpSize :: PrimOp -> Int -> ExprSize 794primOpSize op n_val_args 795 = if primOpOutOfLine op 796 then sizeN (op_size + n_val_args) 797 else sizeN op_size 798 where 799 op_size = primOpCodeSize op 800 801 802buildSize :: ExprSize 803buildSize = SizeIs 0 emptyBag 40 804 -- We really want to inline applications of build 805 -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later) 806 -- Indeed, we should add a result_discount because build is 807 -- very like a constructor. We don't bother to check that the 808 -- build is saturated (it usually is). The "-2" discounts for the \c n, 809 -- The "4" is rather arbitrary. 810 811augmentSize :: ExprSize 812augmentSize = SizeIs 0 emptyBag 40 813 -- Ditto (augment t (\cn -> e) ys) should cost only the cost of 814 -- e plus ys. The -2 accounts for the \cn 815 816-- When we return a lambda, give a discount if it's used (applied) 817lamScrutDiscount :: UnfoldingOpts -> ExprSize -> ExprSize 818lamScrutDiscount opts (SizeIs n vs _) = SizeIs n vs (unfoldingFunAppDiscount opts) 819lamScrutDiscount _ TooBig = TooBig 820 821{- 822Note [addAltSize result discounts] 823~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 824When adding the size of alternatives, we *add* the result discounts 825too, rather than take the *maximum*. For a multi-branch case, this 826gives a discount for each branch that returns a constructor, making us 827keener to inline. I did try using 'max' instead, but it makes nofib 828'rewrite' and 'puzzle' allocate significantly more, and didn't make 829binary sizes shrink significantly either. 830 831Note [Discounts and thresholds] 832~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 833 834Constants for discounts and thresholds are defined in 'UnfoldingOpts'. They are: 835 836unfoldingCreationThreshold 837 At a definition site, if the unfolding is bigger than this, we 838 may discard it altogether 839 840unfoldingUseThreshold 841 At a call site, if the unfolding, less discounts, is smaller than 842 this, then it's small enough inline 843 844unfoldingDictDiscount 845 The discount for each occurrence of a dictionary argument 846 as an argument of a class method. Should be pretty small 847 else big functions may get inlined 848 849unfoldingFunAppDiscount 850 Discount for a function argument that is applied. Quite 851 large, because if we inline we avoid the higher-order call. 852 853unfoldingVeryAggressive 854 If True, the compiler ignores all the thresholds and inlines very 855 aggressively. It still adheres to arity, simplifier phase control and 856 loop breakers. 857 858 859Historical Note: Before April 2020 we had another factor, 860ufKeenessFactor, which would scale the discounts before they were subtracted 861from the size. This was justified with the following comment: 862 863 -- We multiply the raw discounts (args_discount and result_discount) 864 -- ty opt_UnfoldingKeenessFactor because the former have to do with 865 -- *size* whereas the discounts imply that there's some extra 866 -- *efficiency* to be gained (e.g. beta reductions, case reductions) 867 -- by inlining. 868 869However, this is highly suspect since it means that we subtract a *scaled* size 870from an absolute size, resulting in crazy (e.g. negative) scores in some cases 871(#15304). We consequently killed off ufKeenessFactor and bumped up the 872ufUseThreshold to compensate. 873 874 875Note [Function applications] 876~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 877In a function application (f a b) 878 879 - If 'f' is an argument to the function being analysed, 880 and there's at least one value arg, record a FunAppDiscount for f 881 882 - If the application if a PAP (arity > 2 in this example) 883 record a *result* discount (because inlining 884 with "extra" args in the call may mean that we now 885 get a saturated application) 886 887Code for manipulating sizes 888-} 889 890-- | The size of a candidate expression for unfolding 891data ExprSize 892 = TooBig 893 | SizeIs { _es_size_is :: {-# UNPACK #-} !Int -- ^ Size found 894 , _es_args :: !(Bag (Id,Int)) 895 -- ^ Arguments cased herein, and discount for each such 896 , _es_discount :: {-# UNPACK #-} !Int 897 -- ^ Size to subtract if result is scrutinised by a case 898 -- expression 899 } 900 901instance Outputable ExprSize where 902 ppr TooBig = text "TooBig" 903 ppr (SizeIs a _ c) = brackets (int a <+> int c) 904 905-- subtract the discount before deciding whether to bale out. eg. we 906-- want to inline a large constructor application into a selector: 907-- tup = (a_1, ..., a_99) 908-- x = case tup of ... 909-- 910mkSizeIs :: Int -> Int -> Bag (Id, Int) -> Int -> ExprSize 911mkSizeIs max n xs d | (n - d) > max = TooBig 912 | otherwise = SizeIs n xs d 913 914maxSize :: ExprSize -> ExprSize -> ExprSize 915maxSize TooBig _ = TooBig 916maxSize _ TooBig = TooBig 917maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 > n2 = s1 918 | otherwise = s2 919 920sizeZero :: ExprSize 921sizeN :: Int -> ExprSize 922 923sizeZero = SizeIs 0 emptyBag 0 924sizeN n = SizeIs n emptyBag 0 925 926{- 927************************************************************************ 928* * 929\subsection[considerUnfolding]{Given all the info, do (not) do the unfolding} 930* * 931************************************************************************ 932 933We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that 934we ``couldn't possibly use'' on the other side. Can be overridden w/ 935flaggery. Just the same as smallEnoughToInline, except that it has no 936actual arguments. 937-} 938 939couldBeSmallEnoughToInline :: UnfoldingOpts -> Int -> CoreExpr -> Bool 940couldBeSmallEnoughToInline opts threshold rhs 941 = case sizeExpr opts threshold [] body of 942 TooBig -> False 943 _ -> True 944 where 945 (_, body) = collectBinders rhs 946 947---------------- 948smallEnoughToInline :: UnfoldingOpts -> Unfolding -> Bool 949smallEnoughToInline opts (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}}) 950 = size <= unfoldingUseThreshold opts 951smallEnoughToInline _ _ 952 = False 953 954---------------- 955 956certainlyWillInline :: UnfoldingOpts -> IdInfo -> Maybe Unfolding 957-- ^ Sees if the unfolding is pretty certain to inline. 958-- If so, return a *stable* unfolding for it, that will always inline. 959certainlyWillInline opts fn_info 960 = case fn_unf of 961 CoreUnfolding { uf_tmpl = expr, uf_guidance = guidance, uf_src = src } 962 | loop_breaker -> Nothing -- Won't inline, so try w/w 963 | noinline -> Nothing -- See Note [Worker-wrapper for NOINLINE functions] 964 | otherwise 965 -> case guidance of 966 UnfNever -> Nothing 967 UnfWhen {} -> Just (fn_unf { uf_src = src' }) 968 -- INLINE functions have UnfWhen 969 UnfIfGoodArgs { ug_size = size, ug_args = args } 970 -> do_cunf expr size args src' 971 where 972 src' = case src of 973 InlineRhs -> InlineStable 974 _ -> src -- Do not change InlineCompulsory! 975 976 DFunUnfolding {} -> Just fn_unf -- Don't w/w DFuns; it never makes sense 977 -- to do so, and even if it is currently a 978 -- loop breaker, it may not be later 979 980 _other_unf -> Nothing 981 982 where 983 loop_breaker = isStrongLoopBreaker (occInfo fn_info) 984 noinline = inlinePragmaSpec (inlinePragInfo fn_info) == NoInline 985 fn_unf = unfoldingInfo fn_info 986 987 -- The UnfIfGoodArgs case seems important. If we w/w small functions 988 -- binary sizes go up by 10%! (This is with SplitObjs.) 989 -- I'm not totally sure why. 990 -- INLINABLE functions come via this path 991 -- See Note [certainlyWillInline: INLINABLE] 992 do_cunf expr size args src' 993 | arityInfo fn_info > 0 -- See Note [certainlyWillInline: be careful of thunks] 994 , not (isDeadEndSig (strictnessInfo fn_info)) 995 -- Do not unconditionally inline a bottoming functions even if 996 -- it seems smallish. We've carefully lifted it out to top level, 997 -- so we don't want to re-inline it. 998 , let unf_arity = length args 999 , size - (10 * (unf_arity + 1)) <= unfoldingUseThreshold opts 1000 = Just (fn_unf { uf_src = src' 1001 , uf_guidance = UnfWhen { ug_arity = unf_arity 1002 , ug_unsat_ok = unSaturatedOk 1003 , ug_boring_ok = inlineBoringOk expr } }) 1004 -- Note the "unsaturatedOk". A function like f = \ab. a 1005 -- will certainly inline, even if partially applied (f e), so we'd 1006 -- better make sure that the transformed inlining has the same property 1007 | otherwise 1008 = Nothing 1009 1010{- Note [certainlyWillInline: be careful of thunks] 1011~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1012Don't claim that thunks will certainly inline, because that risks work 1013duplication. Even if the work duplication is not great (eg is_cheap 1014holds), it can make a big difference in an inner loop In #5623 we 1015found that the WorkWrap phase thought that 1016 y = case x of F# v -> F# (v +# v) 1017was certainlyWillInline, so the addition got duplicated. 1018 1019Note that we check arityInfo instead of the arity of the unfolding to detect 1020this case. This is so that we don't accidentally fail to inline small partial 1021applications, like `f = g 42` (where `g` recurses into `f`) where g has arity 2 1022(say). Here there is no risk of work duplication, and the RHS is tiny, so 1023certainlyWillInline should return True. But `unf_arity` is zero! However f's 1024arity, gotten from `arityInfo fn_info`, is 1. 1025 1026Failing to say that `f` will inline forces W/W to generate a potentially huge 1027worker for f that will immediately cancel with `g`'s wrapper anyway, causing 1028unnecessary churn in the Simplifier while arriving at the same result. 1029 1030Note [certainlyWillInline: INLINABLE] 1031~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1032certainlyWillInline /must/ return Nothing for a large INLINABLE thing, 1033even though we have a stable inlining, so that strictness w/w takes 1034place. It makes a big difference to efficiency, and the w/w pass knows 1035how to transfer the INLINABLE info to the worker; see WorkWrap 1036Note [Worker-wrapper for INLINABLE functions] 1037 1038************************************************************************ 1039* * 1040\subsection{callSiteInline} 1041* * 1042************************************************************************ 1043 1044This is the key function. It decides whether to inline a variable at a call site 1045 1046callSiteInline is used at call sites, so it is a bit more generous. 1047It's a very important function that embodies lots of heuristics. 1048A non-WHNF can be inlined if it doesn't occur inside a lambda, 1049and occurs exactly once or 1050 occurs once in each branch of a case and is small 1051 1052If the thing is in WHNF, there's no danger of duplicating work, 1053so we can inline if it occurs once, or is small 1054 1055NOTE: we don't want to inline top-level functions that always diverge. 1056It just makes the code bigger. Tt turns out that the convenient way to prevent 1057them inlining is to give them a NOINLINE pragma, which we do in 1058StrictAnal.addStrictnessInfoToTopId 1059-} 1060 1061callSiteInline :: Logger 1062 -> DynFlags 1063 -> Int -- Case depth 1064 -> Id -- The Id 1065 -> Bool -- True <=> unfolding is active 1066 -> Bool -- True if there are no arguments at all (incl type args) 1067 -> [ArgSummary] -- One for each value arg; True if it is interesting 1068 -> CallCtxt -- True <=> continuation is interesting 1069 -> Maybe CoreExpr -- Unfolding, if any 1070 1071data ArgSummary = TrivArg -- Nothing interesting 1072 | NonTrivArg -- Arg has structure 1073 | ValueArg -- Arg is a con-app or PAP 1074 -- ..or con-like. Note [Conlike is interesting] 1075 1076instance Outputable ArgSummary where 1077 ppr TrivArg = text "TrivArg" 1078 ppr NonTrivArg = text "NonTrivArg" 1079 ppr ValueArg = text "ValueArg" 1080 1081nonTriv :: ArgSummary -> Bool 1082nonTriv TrivArg = False 1083nonTriv _ = True 1084 1085data CallCtxt 1086 = BoringCtxt 1087 | RhsCtxt -- Rhs of a let-binding; see Note [RHS of lets] 1088 | DiscArgCtxt -- Argument of a function with non-zero arg discount 1089 | RuleArgCtxt -- We are somewhere in the argument of a function with rules 1090 1091 | ValAppCtxt -- We're applied to at least one value arg 1092 -- This arises when we have ((f x |> co) y) 1093 -- Then the (f x) has argument 'x' but in a ValAppCtxt 1094 1095 | CaseCtxt -- We're the scrutinee of a case 1096 -- that decomposes its scrutinee 1097 1098instance Outputable CallCtxt where 1099 ppr CaseCtxt = text "CaseCtxt" 1100 ppr ValAppCtxt = text "ValAppCtxt" 1101 ppr BoringCtxt = text "BoringCtxt" 1102 ppr RhsCtxt = text "RhsCtxt" 1103 ppr DiscArgCtxt = text "DiscArgCtxt" 1104 ppr RuleArgCtxt = text "RuleArgCtxt" 1105 1106callSiteInline logger dflags !case_depth id active_unfolding lone_variable arg_infos cont_info 1107 = case idUnfolding id of 1108 -- idUnfolding checks for loop-breakers, returning NoUnfolding 1109 -- Things with an INLINE pragma may have an unfolding *and* 1110 -- be a loop breaker (maybe the knot is not yet untied) 1111 CoreUnfolding { uf_tmpl = unf_template 1112 , uf_is_work_free = is_wf 1113 , uf_guidance = guidance, uf_expandable = is_exp } 1114 | active_unfolding -> tryUnfolding logger dflags case_depth id lone_variable 1115 arg_infos cont_info unf_template 1116 is_wf is_exp guidance 1117 | otherwise -> traceInline logger dflags id "Inactive unfolding:" (ppr id) Nothing 1118 NoUnfolding -> Nothing 1119 BootUnfolding -> Nothing 1120 OtherCon {} -> Nothing 1121 DFunUnfolding {} -> Nothing -- Never unfold a DFun 1122 1123-- | Report the inlining of an identifier's RHS to the user, if requested. 1124traceInline :: Logger -> DynFlags -> Id -> String -> SDoc -> a -> a 1125traceInline logger dflags inline_id str doc result 1126 -- We take care to ensure that doc is used in only one branch, ensuring that 1127 -- the simplifier can push its allocation into the branch. See Note [INLINE 1128 -- conditional tracing utilities]. 1129 | enable = putTraceMsg logger dflags str doc result 1130 | otherwise = result 1131 where 1132 enable 1133 | dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags 1134 = True 1135 | Just prefix <- inlineCheck dflags 1136 = prefix `isPrefixOf` occNameString (getOccName inline_id) 1137 | otherwise 1138 = False 1139{-# INLINE traceInline #-} -- see Note [INLINE conditional tracing utilities] 1140 1141{- Note [Avoid inlining into deeply nested cases] 1142 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1143 1144Consider a function f like this: 1145 1146 f arg1 arg2 = 1147 case ... 1148 ... -> g arg1 1149 ... -> g arg2 1150 1151This function is small. So should be safe to inline. 1152However sometimes this doesn't quite work out like that. 1153Consider this code: 1154 1155f1 arg1 arg2 ... = ... 1156 case _foo of 1157 alt1 -> ... f2 arg1 ... 1158 alt2 -> ... f2 arg2 ... 1159 1160f2 arg1 arg2 ... = ... 1161 case _foo of 1162 alt1 -> ... f3 arg1 ... 1163 alt2 -> ... f3 arg2 ... 1164 1165f3 arg1 arg2 ... = ... 1166 1167... repeats up to n times. And then f1 is 1168applied to some arguments: 1169 1170foo = ... f1 <interestingArgs> ... 1171 1172Initially f2..fn are not interesting to inline so we don't. 1173However we see that f1 is applied to interesting args. 1174So it's an obvious choice to inline those: 1175 1176foo = 1177 ... 1178 case _foo of 1179 alt1 -> ... f2 <interestingArg> ... 1180 alt2 -> ... f2 <interestingArg> ... 1181 1182As a result we go and inline f2 both mentions of f2 in turn are now applied to interesting 1183arguments and f2 is small: 1184 1185foo = 1186 ... 1187 case _foo of 1188 alt1 -> ... case _foo of 1189 alt1 -> ... f3 <interestingArg> ... 1190 alt2 -> ... f3 <interestingArg> ... 1191 1192 alt2 -> ... case _foo of 1193 alt1 -> ... f3 <interestingArg> ... 1194 alt2 -> ... f3 <interestingArg> ... 1195 1196The same thing happens for each binding up to f_n, duplicating the amount of inlining 1197done in each step. Until at some point we are either done or run out of simplifier 1198ticks/RAM. This pattern happened #18730. 1199 1200To combat this we introduce one more heuristic when weighing inlining decision. 1201We keep track of a "case-depth". Which increases each time we look inside a case 1202expression with more than one alternative. 1203 1204We then apply a penalty to inlinings based on the case-depth at which they would 1205be inlined. Bounding the number of inlinings in such a scenario. 1206 1207The heuristic can be tuned in two ways: 1208 1209* We can ignore the first n levels of case nestings for inlining decisions using 1210 -funfolding-case-threshold. 1211* The penalty grows linear with the depth. It's computed as size*(depth-threshold)/scaling. 1212 Scaling can be set with -funfolding-case-scaling. 1213 1214Some guidance on setting these defaults: 1215 1216* A low treshold (<= 2) is needed to prevent exponential cases from spiraling out of 1217 control. We picked 2 for no particular reason. 1218* Scaling the penalty by any more than 30 means the reproducer from 1219 T18730 won't compile even with reasonably small values of n. Instead 1220 it will run out of runs/ticks. This means to positively affect the reproducer 1221 a scaling <= 30 is required. 1222* A scaling of >= 15 still causes a few very large regressions on some nofib benchmarks. 1223 (+80% for gc/fulsom, +90% for real/ben-raytrace, +20% for spectral/fibheaps) 1224* A scaling of >= 25 showed no regressions on nofib. However it showed a number of 1225 (small) regression for compiler perf benchmarks. 1226 1227The end result is that we are settling for a scaling of 30, with a threshold of 2. 1228This gives us minimal compiler perf regressions. No nofib runtime regressions and 1229will still avoid this pattern sometimes. This is a "safe" default, where we err on 1230the side of compiler blowup instead of risking runtime regressions. 1231 1232For cases where the default falls short the flag can be changed to allow more/less inlining as 1233needed on a per-module basis. 1234 1235-} 1236 1237tryUnfolding :: Logger -> DynFlags -> Int -> Id -> Bool -> [ArgSummary] -> CallCtxt 1238 -> CoreExpr -> Bool -> Bool -> UnfoldingGuidance 1239 -> Maybe CoreExpr 1240tryUnfolding logger dflags !case_depth id lone_variable 1241 arg_infos cont_info unf_template 1242 is_wf is_exp guidance 1243 = case guidance of 1244 UnfNever -> traceInline logger dflags id str (text "UnfNever") Nothing 1245 1246 UnfWhen { ug_arity = uf_arity, ug_unsat_ok = unsat_ok, ug_boring_ok = boring_ok } 1247 | enough_args && (boring_ok || some_benefit || unfoldingVeryAggressive uf_opts) 1248 -- See Note [INLINE for small functions (3)] 1249 -> traceInline logger dflags id str (mk_doc some_benefit empty True) (Just unf_template) 1250 | otherwise 1251 -> traceInline logger dflags id str (mk_doc some_benefit empty False) Nothing 1252 where 1253 some_benefit = calc_some_benefit uf_arity 1254 enough_args = (n_val_args >= uf_arity) || (unsat_ok && n_val_args > 0) 1255 1256 UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size } 1257 | unfoldingVeryAggressive uf_opts 1258 -> traceInline logger dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template) 1259 | is_wf && some_benefit && small_enough 1260 -> traceInline logger dflags id str (mk_doc some_benefit extra_doc True) (Just unf_template) 1261 | otherwise 1262 -> traceInline logger dflags id str (mk_doc some_benefit extra_doc False) Nothing 1263 where 1264 some_benefit = calc_some_benefit (length arg_discounts) 1265 extra_doc = vcat [ text "case depth =" <+> int case_depth 1266 , text "depth based penalty =" <+> int depth_penalty 1267 , text "discounted size =" <+> int adjusted_size ] 1268 -- See Note [Avoid inlining into deeply nested cases] 1269 depth_treshold = unfoldingCaseThreshold uf_opts 1270 depth_scaling = unfoldingCaseScaling uf_opts 1271 depth_penalty | case_depth <= depth_treshold = 0 1272 | otherwise = (size * (case_depth - depth_treshold)) `div` depth_scaling 1273 adjusted_size = size + depth_penalty - discount 1274 small_enough = adjusted_size <= unfoldingUseThreshold uf_opts 1275 discount = computeDiscount arg_discounts res_discount arg_infos cont_info 1276 1277 where 1278 uf_opts = unfoldingOpts dflags 1279 mk_doc some_benefit extra_doc yes_or_no 1280 = vcat [ text "arg infos" <+> ppr arg_infos 1281 , text "interesting continuation" <+> ppr cont_info 1282 , text "some_benefit" <+> ppr some_benefit 1283 , text "is exp:" <+> ppr is_exp 1284 , text "is work-free:" <+> ppr is_wf 1285 , text "guidance" <+> ppr guidance 1286 , extra_doc 1287 , text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"] 1288 1289 ctx = initSDocContext dflags defaultDumpStyle 1290 str = "Considering inlining: " ++ showSDocDump ctx (ppr id) 1291 n_val_args = length arg_infos 1292 1293 -- some_benefit is used when the RHS is small enough 1294 -- and the call has enough (or too many) value 1295 -- arguments (ie n_val_args >= arity). But there must 1296 -- be *something* interesting about some argument, or the 1297 -- result context, to make it worth inlining 1298 calc_some_benefit :: Arity -> Bool -- The Arity is the number of args 1299 -- expected by the unfolding 1300 calc_some_benefit uf_arity 1301 | not saturated = interesting_args -- Under-saturated 1302 -- Note [Unsaturated applications] 1303 | otherwise = interesting_args -- Saturated or over-saturated 1304 || interesting_call 1305 where 1306 saturated = n_val_args >= uf_arity 1307 over_saturated = n_val_args > uf_arity 1308 interesting_args = any nonTriv arg_infos 1309 -- NB: (any nonTriv arg_infos) looks at the 1310 -- over-saturated args too which is "wrong"; 1311 -- but if over-saturated we inline anyway. 1312 1313 interesting_call 1314 | over_saturated 1315 = True 1316 | otherwise 1317 = case cont_info of 1318 CaseCtxt -> not (lone_variable && is_exp) -- Note [Lone variables] 1319 ValAppCtxt -> True -- Note [Cast then apply] 1320 RuleArgCtxt -> uf_arity > 0 -- See Note [Unfold info lazy contexts] 1321 DiscArgCtxt -> uf_arity > 0 -- Note [Inlining in ArgCtxt] 1322 RhsCtxt -> uf_arity > 0 -- 1323 _other -> False -- See Note [Nested functions] 1324 1325 1326{- 1327Note [Unfold into lazy contexts], Note [RHS of lets] 1328~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1329When the call is the argument of a function with a RULE, or the RHS of a let, 1330we are a little bit keener to inline. For example 1331 f y = (y,y,y) 1332 g y = let x = f y in ...(case x of (a,b,c) -> ...) ... 1333We'd inline 'f' if the call was in a case context, and it kind-of-is, 1334only we can't see it. Also 1335 x = f v 1336could be expensive whereas 1337 x = case v of (a,b) -> a 1338is patently cheap and may allow more eta expansion. 1339So we treat the RHS of a let as not-totally-boring. 1340 1341Note [Unsaturated applications] 1342~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1343When a call is not saturated, we *still* inline if one of the 1344arguments has interesting structure. That's sometimes very important. 1345A good example is the Ord instance for Bool in Base: 1346 1347 Rec { 1348 $fOrdBool =GHC.Classes.D:Ord 1349 @ Bool 1350 ... 1351 $cmin_ajX 1352 1353 $cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool 1354 $cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool 1355 } 1356 1357But the defn of GHC.Classes.$dmmin is: 1358 1359 $dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a 1360 {- Arity: 3, HasNoCafRefs, Strictness: SLL, 1361 Unfolding: (\ @ a $dOrd :: GHC.Classes.Ord a x :: a y :: a -> 1362 case @ a GHC.Classes.<= @ a $dOrd x y of wild { 1363 GHC.Types.False -> y GHC.Types.True -> x }) -} 1364 1365We *really* want to inline $dmmin, even though it has arity 3, in 1366order to unravel the recursion. 1367 1368 1369Note [Things to watch] 1370~~~~~~~~~~~~~~~~~~~~~~ 1371* { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... } 1372 Assume x is exported, so not inlined unconditionally. 1373 Then we want x to inline unconditionally; no reason for it 1374 not to, and doing so avoids an indirection. 1375 1376* { x = I# 3; ....f x.... } 1377 Make sure that x does not inline unconditionally! 1378 Lest we get extra allocation. 1379 1380Note [Inlining an InlineRule] 1381~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1382An InlineRules is used for 1383 (a) programmer INLINE pragmas 1384 (b) inlinings from worker/wrapper 1385 1386For (a) the RHS may be large, and our contract is that we *only* inline 1387when the function is applied to all the arguments on the LHS of the 1388source-code defn. (The uf_arity in the rule.) 1389 1390However for worker/wrapper it may be worth inlining even if the 1391arity is not satisfied (as we do in the CoreUnfolding case) so we don't 1392require saturation. 1393 1394Note [Nested functions] 1395~~~~~~~~~~~~~~~~~~~~~~~ 1396At one time we treated a call of a non-top-level function as 1397"interesting" (regardless of how boring the context) in the hope 1398that inlining it would eliminate the binding, and its allocation. 1399Specifically, in the default case of interesting_call we had 1400 _other -> not is_top && uf_arity > 0 1401 1402But actually postInlineUnconditionally does some of this and overall 1403it makes virtually no difference to nofib. So I simplified away this 1404special case 1405 1406Note [Cast then apply] 1407~~~~~~~~~~~~~~~~~~~~~~ 1408Consider 1409 myIndex = __inline_me ( (/\a. <blah>) |> co ) 1410 co :: (forall a. a -> a) ~ (forall a. T a) 1411 ... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ... 1412 1413We need to inline myIndex to unravel this; but the actual call (myIndex a) has 1414no value arguments. The ValAppCtxt gives it enough incentive to inline. 1415 1416Note [Inlining in ArgCtxt] 1417~~~~~~~~~~~~~~~~~~~~~~~~~~ 1418The condition (arity > 0) here is very important, because otherwise 1419we end up inlining top-level stuff into useless places; eg 1420 x = I# 3# 1421 f = \y. g x 1422This can make a very big difference: it adds 16% to nofib 'integer' allocs, 1423and 20% to 'power'. 1424 1425At one stage I replaced this condition by 'True' (leading to the above 1426slow-down). The motivation was test eyeball/inline1.hs; but that seems 1427to work ok now. 1428 1429NOTE: arguably, we should inline in ArgCtxt only if the result of the 1430call is at least CONLIKE. At least for the cases where we use ArgCtxt 1431for the RHS of a 'let', we only profit from the inlining if we get a 1432CONLIKE thing (modulo lets). 1433 1434Note [Lone variables] See also Note [Interaction of exprIsWorkFree and lone variables] 1435~~~~~~~~~~~~~~~~~~~~~ which appears below 1436The "lone-variable" case is important. I spent ages messing about 1437with unsatisfactory variants, but this is nice. The idea is that if a 1438variable appears all alone 1439 1440 as an arg of lazy fn, or rhs BoringCtxt 1441 as scrutinee of a case CaseCtxt 1442 as arg of a fn ArgCtxt 1443AND 1444 it is bound to a cheap expression 1445 1446then we should not inline it (unless there is some other reason, 1447e.g. it is the sole occurrence). That is what is happening at 1448the use of 'lone_variable' in 'interesting_call'. 1449 1450Why? At least in the case-scrutinee situation, turning 1451 let x = (a,b) in case x of y -> ... 1452into 1453 let x = (a,b) in case (a,b) of y -> ... 1454and thence to 1455 let x = (a,b) in let y = (a,b) in ... 1456is bad if the binding for x will remain. 1457 1458Another example: I discovered that strings 1459were getting inlined straight back into applications of 'error' 1460because the latter is strict. 1461 s = "foo" 1462 f = \x -> ...(error s)... 1463 1464Fundamentally such contexts should not encourage inlining because, provided 1465the RHS is "expandable" (see Note [exprIsExpandable] in GHC.Core.Utils) the 1466context can ``see'' the unfolding of the variable (e.g. case or a 1467RULE) so there's no gain. 1468 1469However, watch out: 1470 1471 * Consider this: 1472 foo = _inline_ (\n. [n]) 1473 bar = _inline_ (foo 20) 1474 baz = \n. case bar of { (m:_) -> m + n } 1475 Here we really want to inline 'bar' so that we can inline 'foo' 1476 and the whole thing unravels as it should obviously do. This is 1477 important: in the NDP project, 'bar' generates a closure data 1478 structure rather than a list. 1479 1480 So the non-inlining of lone_variables should only apply if the 1481 unfolding is regarded as cheap; because that is when exprIsConApp_maybe 1482 looks through the unfolding. Hence the "&& is_wf" in the 1483 InlineRule branch. 1484 1485 * Even a type application or coercion isn't a lone variable. 1486 Consider 1487 case $fMonadST @ RealWorld of { :DMonad a b c -> c } 1488 We had better inline that sucker! The case won't see through it. 1489 1490 For now, I'm treating treating a variable applied to types 1491 in a *lazy* context "lone". The motivating example was 1492 f = /\a. \x. BIG 1493 g = /\a. \y. h (f a) 1494 There's no advantage in inlining f here, and perhaps 1495 a significant disadvantage. Hence some_val_args in the Stop case 1496 1497Note [Interaction of exprIsWorkFree and lone variables] 1498~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1499The lone-variable test says "don't inline if a case expression 1500scrutinises a lone variable whose unfolding is cheap". It's very 1501important that, under these circumstances, exprIsConApp_maybe 1502can spot a constructor application. So, for example, we don't 1503consider 1504 let x = e in (x,x) 1505to be cheap, and that's good because exprIsConApp_maybe doesn't 1506think that expression is a constructor application. 1507 1508In the 'not (lone_variable && is_wf)' test, I used to test is_value 1509rather than is_wf, which was utterly wrong, because the above 1510expression responds True to exprIsHNF, which is what sets is_value. 1511 1512This kind of thing can occur if you have 1513 1514 {-# INLINE foo #-} 1515 foo = let x = e in (x,x) 1516 1517which Roman did. 1518 1519 1520-} 1521 1522computeDiscount :: [Int] -> Int -> [ArgSummary] -> CallCtxt 1523 -> Int 1524computeDiscount arg_discounts res_discount arg_infos cont_info 1525 1526 = 10 -- Discount of 10 because the result replaces the call 1527 -- so we count 10 for the function itself 1528 1529 + 10 * length actual_arg_discounts 1530 -- Discount of 10 for each arg supplied, 1531 -- because the result replaces the call 1532 1533 + total_arg_discount + res_discount' 1534 where 1535 actual_arg_discounts = zipWith mk_arg_discount arg_discounts arg_infos 1536 total_arg_discount = sum actual_arg_discounts 1537 1538 mk_arg_discount _ TrivArg = 0 1539 mk_arg_discount _ NonTrivArg = 10 1540 mk_arg_discount discount ValueArg = discount 1541 1542 res_discount' 1543 | LT <- arg_discounts `compareLength` arg_infos 1544 = res_discount -- Over-saturated 1545 | otherwise 1546 = case cont_info of 1547 BoringCtxt -> 0 1548 CaseCtxt -> res_discount -- Presumably a constructor 1549 ValAppCtxt -> res_discount -- Presumably a function 1550 _ -> 40 `min` res_discount 1551 -- ToDo: this 40 `min` res_discount doesn't seem right 1552 -- for DiscArgCtxt it shouldn't matter because the function will 1553 -- get the arg discount for any non-triv arg 1554 -- for RuleArgCtxt we do want to be keener to inline; but not only 1555 -- constructor results 1556 -- for RhsCtxt I suppose that exposing a data con is good in general 1557 -- And 40 seems very arbitrary 1558 -- 1559 -- res_discount can be very large when a function returns 1560 -- constructors; but we only want to invoke that large discount 1561 -- when there's a case continuation. 1562 -- Otherwise we, rather arbitrarily, threshold it. Yuk. 1563 -- But we want to avoid inlining large functions that return 1564 -- constructors into contexts that are simply "interesting" 1565