1%%% 2%%% Copyright 2011, Boundary 3%%% 4%%% Licensed under the Apache License, Version 2.0 (the "License"); 5%%% you may not use this file except in compliance with the License. 6%%% You may obtain a copy of the License at 7%%% 8%%% http://www.apache.org/licenses/LICENSE-2.0 9%%% 10%%% Unless required by applicable law or agreed to in writing, software 11%%% distributed under the License is distributed on an "AS IS" BASIS, 12%%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13%%% See the License for the specific language governing permissions and 14%%% limitations under the License. 15%%% 16 17 18%%%------------------------------------------------------------------- 19%%% File: bear.erl 20%%% @author joe williams <j@boundary.com> 21%%% @doc 22%%% statistics functions for calucating based on id and a list of values 23%%% @end 24%%%------------------------------------------------------------------ 25 26-module(bear). 27 28-compile([export_all]). 29 30-export([ 31 get_statistics/1, 32 get_statistics/2 33 ]). 34 35-define(HIST_BINS, 10). 36 37-define(STATS_MIN, 5). 38 39-record(scan_result, {n=0, sumX=0, sumXX=0, sumInv=0, sumLog, max, min}). 40-record(scan_result2, {x2=0, x3=0, x4=0}). 41 42 43get_statistics([_,_,_,_,_|_] = Values) -> 44 Scan_res = scan_values(Values), 45 Scan_res2 = scan_values2(Values, Scan_res), 46 Variance = variance(Scan_res, Scan_res2), 47 SortedValues = lists:sort(Values), 48 [ 49 {min, Scan_res#scan_result.min}, 50 {max, Scan_res#scan_result.max}, 51 {arithmetic_mean, arithmetic_mean(Scan_res)}, 52 {geometric_mean, geometric_mean(Scan_res)}, 53 {harmonic_mean, harmonic_mean(Scan_res)}, 54 {median, percentile(SortedValues, Scan_res, 0.5)}, 55 {variance, Variance}, 56 {standard_deviation, std_deviation(Scan_res, Scan_res2)}, 57 {skewness, skewness(Scan_res, Scan_res2)}, 58 {kurtosis, kurtosis(Scan_res, Scan_res2)}, 59 {percentile, 60 [ 61 {50, percentile(SortedValues, Scan_res, 0.50)}, 62 {75, percentile(SortedValues, Scan_res, 0.75)}, 63 {90, percentile(SortedValues, Scan_res, 0.90)}, 64 {95, percentile(SortedValues, Scan_res, 0.95)}, 65 {99, percentile(SortedValues, Scan_res, 0.99)}, 66 {999, percentile(SortedValues, Scan_res, 0.999)} 67 ] 68 }, 69 {histogram, get_histogram(Values, Scan_res, Scan_res2)}, 70 {n, Scan_res#scan_result.n} 71 ]; 72get_statistics(Values) when is_list(Values) -> 73 [ 74 {min, 0.0}, 75 {max, 0.0}, 76 {arithmetic_mean, 0.0}, 77 {geometric_mean, 0.0}, 78 {harmonic_mean, 0.0}, 79 {median, 0.0}, 80 {variance, 0.0}, 81 {standard_deviation, 0.0}, 82 {skewness, 0.0}, 83 {kurtosis, 0.0}, 84 {percentile, 85 [ 86 {50, 0.0}, 87 {75, 0.0}, 88 {90, 0.0}, 89 {95, 0.0}, 90 {99, 0.0}, 91 {999, 0.0} 92 ] 93 }, 94 {histogram, [{0, 0}]}, 95 {n, 0} 96 ]. 97 98get_statistics_subset([_,_,_,_,_|_] = Values, Items) -> 99 Length = length(Values), 100 SortedValues = lists:sort(Values), 101 Steps = calc_steps(Items), 102 Scan_res = if Steps > 1 -> scan_values(Values); 103 true -> [] 104 end, 105 Scan_res2 = if Steps > 2 -> scan_values2(Values, Scan_res); 106 true -> [] 107 end, 108 report_subset(Items, Length, SortedValues, Scan_res, Scan_res2); 109get_statistics_subset(Values, Items) when is_list(Values) -> 110 get_null_statistics_subset(Items, []). 111 112get_null_statistics_subset([{percentile, Ps}|Items], Acc) -> 113 get_null_statistics_subset(Items, [{percentile, [{P, 0.0} || P <- Ps]}|Acc]); 114get_null_statistics_subset([I|Items], Acc) -> 115 get_null_statistics_subset(Items, [{I, 0.0}|Acc]); 116get_null_statistics_subset([], Acc) -> 117 lists:reverse(Acc). 118 119calc_steps(Items) -> 120 lists:foldl( 121 fun({I,_},Acc) -> 122 erlang:max(level(I), Acc); 123 (I,Acc) -> 124 erlang:max(level(I), Acc) 125 end, 1, Items). 126 127level(standard_deviation) -> 3; 128level(variance ) -> 3; 129level(skewness ) -> 3; 130level(kurtosis ) -> 3; 131level(histogram ) -> 3; 132level(arithmetic_mean ) -> 2; 133level(geometric_mean ) -> 2; 134level(harmonic_mean ) -> 2; 135level(_) -> 1. 136 137report_subset(Items, N, SortedValues, Scan_res, Scan_res2) -> 138 lists:map( 139 fun(min) -> {min, hd(SortedValues)}; 140 (max) -> {max, lists:last(SortedValues)}; 141 (arithmetic_mean) -> {arithmetic_mean, arithmetic_mean(Scan_res)}; 142 (harmonic_mean) -> {harmonic_mean, harmonic_mean(Scan_res)}; 143 (geometric_mean) -> {geometric_mean, geometric_mean(Scan_res)}; 144 (median) -> {median, percentile(SortedValues, 145 #scan_result{n = N}, 0.5)}; 146 (variance) -> {variance, variance(Scan_res, Scan_res2)}; 147 (standard_deviation=I) -> {I, std_deviation(Scan_res, Scan_res2)}; 148 (skewness) -> {skewness, skewness(Scan_res, Scan_res2)}; 149 (kurtosis) -> {kurtosis, kurtosis(Scan_res, Scan_res2)}; 150 ({percentile,Ps}) -> {percentile, percentiles(Ps, N, SortedValues)}; 151 (histogram) -> 152 {histogram, get_histogram(SortedValues, Scan_res, Scan_res2)}; 153 (n) -> {n, N} 154 end, Items). 155 156get_statistics(Values, _) when length(Values) < ?STATS_MIN -> 157 0.0; 158get_statistics(_, Values) when length(Values) < ?STATS_MIN -> 159 0.0; 160get_statistics(Values1, Values2) when length(Values1) /= length(Values2) -> 161 0.0; 162get_statistics(Values1, Values2) -> 163 [ 164 {covariance, get_covariance(Values1, Values2)}, 165 {tau, get_kendall_correlation(Values1, Values2)}, 166 {rho, get_pearson_correlation(Values1, Values2)}, 167 {r, get_spearman_correlation(Values1, Values2)} 168 ]. 169 170%%%=================================================================== 171%%% Internal functions 172%%%=================================================================== 173 174scan_values([X|Values]) -> 175 scan_values(Values, #scan_result{n=1, sumX=X, sumXX=X*X, 176 sumLog=math_log(X), 177 max=X, min=X, sumInv=inverse(X)}). 178 179scan_values([X|Values], 180 #scan_result{n=N, sumX=SumX, sumXX=SumXX, sumLog=SumLog, 181 max=Max, min=Min, sumInv=SumInv}=Acc) -> 182 scan_values(Values, 183 Acc#scan_result{n=N+1, sumX=SumX+X, sumXX=SumXX+X*X, 184 sumLog=SumLog+math_log(X), 185 max=max(X,Max), min=min(X,Min), 186 sumInv=SumInv+inverse(X)}); 187scan_values([], Acc) -> 188 Acc. 189 190scan_values2(Values, #scan_result{n=N, sumX=SumX}) -> 191 scan_values2(Values, SumX/N, #scan_result2{}). 192 193scan_values2([X|Values], Mean, #scan_result2{x2=X2, x3=X3, x4=X4}=Acc) -> 194 Diff = X-Mean, 195 Diff2 = Diff*Diff, 196 Diff3 = Diff2*Diff, 197 Diff4 = Diff2*Diff2, 198 scan_values2(Values, Mean, Acc#scan_result2{x2=X2+Diff2, x3=X3+Diff3, 199 x4=X4+Diff4}); 200scan_values2([], _, Acc) -> 201 Acc. 202 203 204arithmetic_mean(#scan_result{n=N, sumX=Sum}) -> 205 Sum/N. 206 207geometric_mean(#scan_result{n=N, sumLog=SumLog}) -> 208 math:exp(SumLog/N). 209 210harmonic_mean(#scan_result{sumInv=Zero}) when Zero =:= 0 orelse 211 Zero =:= 0.0 -> 212 %% Protect against divide by 0 if we have all 0 values 213 0; 214harmonic_mean(#scan_result{n=N, sumInv=Sum}) -> 215 N/Sum. 216 217percentile(SortedValues, #scan_result{n=N}, Percentile) 218 when is_list(SortedValues) -> 219 Element = round(Percentile * N), 220 lists:nth(Element, SortedValues). 221 222%% Two pass variance 223%% Results match those given by the 'var' function in R 224variance(#scan_result{n=N}, #scan_result2{x2=X2}) -> 225 X2/(N-1). 226 227std_deviation(Scan_res, Scan_res2) -> 228 math:sqrt(variance(Scan_res, Scan_res2)). 229 230%% http://en.wikipedia.org/wiki/Skewness 231%% 232%% skewness results should match this R function: 233%% skewness <- function(x) { 234%% m3 <- mean((x - mean(x))^3) 235%% skew <- m3 / (sd(x)^3) 236%% skew 237%% } 238skewness(#scan_result{n=N}=Scan_res, #scan_result2{x3=X3}=Scan_res2) -> 239 case math:pow(std_deviation(Scan_res,Scan_res2), 3) of 240 0.0 -> 241 0.0; %% Is this really the correct thing to do here? 242 Else -> 243 (X3/N)/Else 244 end. 245 246%% http://en.wikipedia.org/wiki/Kurtosis 247%% 248%% results should match this R function: 249%% kurtosis <- function(x) { 250%% m4 <- mean((x - mean(x))^4) 251%% kurt <- m4 / (sd(x)^4) - 3 252%% kurt 253%% } 254kurtosis(#scan_result{n=N}=Scan_res, #scan_result2{x4=X4}=Scan_res2) -> 255 case math:pow(std_deviation(Scan_res,Scan_res2), 4) of 256 0.0 -> 257 0.0; %% Is this really the correct thing to do here? 258 Else -> 259 ((X4/N)/Else) - 3 260 end. 261 262get_histogram(Values, Scan_res, Scan_res2) -> 263 Bins = get_hist_bins(Scan_res#scan_result.min, 264 Scan_res#scan_result.max, 265 std_deviation(Scan_res, Scan_res2), 266 length(Values) 267 ), 268 269 Dict = lists:foldl(fun (Value, Dict) -> 270 update_bin(Value, Bins, Dict) 271 end, 272 dict:from_list([{Bin, 0} || Bin <- Bins]), 273 Values), 274 275 lists:sort(dict:to_list(Dict)). 276 277update_bin(Value, [Bin|_Bins], Dict) when Value =< Bin -> 278 dict:update_counter(Bin, 1, Dict); 279update_bin(Values, [_Bin|Bins], Dict) -> 280 update_bin(Values, Bins, Dict). 281 282%% two pass covariance 283%% (http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Covariance) 284%% matches results given by excel's 'covar' function 285get_covariance(Values, _) when length(Values) < ?STATS_MIN -> 286 0.0; 287get_covariance(_, Values) when length(Values) < ?STATS_MIN -> 288 0.0; 289get_covariance(Values1, Values2) when length(Values1) /= length(Values2) -> 290 0.0; 291get_covariance(Values1, Values2) -> 292 {SumX, SumY, N} = foldl2(fun (X, Y, {SumX, SumY, N}) -> 293 {SumX+X, SumY+Y, N+1} 294 end, {0,0,0}, Values1, Values2), 295 MeanX = SumX/N, 296 MeanY = SumY/N, 297 Sum = foldl2(fun (X, Y, Sum) -> 298 Sum + ((X - MeanX) * (Y - MeanY)) 299 end, 300 0, Values1, Values2), 301 Sum/N. 302 303get_kendall_correlation(Values, _) when length(Values) < ?STATS_MIN -> 304 0.0; 305get_kendall_correlation(_, Values) when length(Values) < ?STATS_MIN -> 306 0.0; 307get_kendall_correlation(Values1, Values2) when length(Values1) /= length(Values2) -> 308 0.0; 309get_kendall_correlation(Values1, Values2) -> 310 bear:kendall_correlation(Values1, Values2). 311 312get_spearman_correlation(Values, _) when length(Values) < ?STATS_MIN -> 313 0.0; 314get_spearman_correlation(_, Values) when length(Values) < ?STATS_MIN -> 315 0.0; 316get_spearman_correlation(Values1, Values2) when length(Values1) /= length(Values2) -> 317 0.0; 318get_spearman_correlation(Values1, Values2) -> 319 TR1 = ranks_of(Values1), 320 TR2 = ranks_of(Values2), 321 Numerator = 6 * foldl2(fun (X, Y, Acc) -> 322 Diff = X-Y, 323 Acc + Diff*Diff 324 end, 0, TR1,TR2), 325 N = length(Values1), 326 Denominator = math:pow(N,3)-N, 327 1-(Numerator/Denominator). 328 329ranks_of(Values) when is_list(Values) -> 330 [Fst|Rest] = revsort(Values), 331 TRs = ranks_of(Rest, [], 2, Fst, 1), 332 Dict = gb_trees:from_orddict(TRs), 333 L = lists:foldl(fun (Val, Acc) -> 334 Rank = gb_trees:get(Val, Dict), 335 [Rank|Acc] 336 end, [], Values), 337 lists:reverse(L). 338 339ranks_of([E|Es], Acc, N, P, S) -> 340 ranks_of(Es,[{P,(S+N-1)/2}|Acc], N+1, E, N); 341ranks_of([], Acc, N, P, S) -> 342 [{P,(S+N-1)/2}|Acc]. 343 344 345get_pearson_correlation(Values, _) when length(Values) < ?STATS_MIN -> 346 0.0; 347get_pearson_correlation(_, Values) when length(Values) < ?STATS_MIN -> 348 0.0; 349get_pearson_correlation(Values1, Values2) when length(Values1) /= length(Values2) -> 350 0.0; 351get_pearson_correlation(Values1, Values2) -> 352 {SumX, SumY, SumXX, SumYY, SumXY, N} = 353 foldl2(fun (X,Y,{SX, SY, SXX, SYY, SXY, N}) -> 354 {SX+X, SY+Y, SXX+X*X, SYY+Y*Y, SXY+X*Y, N+1} 355 end, {0,0,0,0,0,0}, Values1, Values2), 356 Numer = (N*SumXY) - (SumX * SumY), 357 case math:sqrt(((N*SumXX)-(SumX*SumX)) * ((N*SumYY)-(SumY*SumY))) of 358 0.0 -> 359 0.0; %% Is this really the correct thing to do here? 360 Denom -> 361 Numer/Denom 362 end. 363 364revsort(L) -> 365 lists:reverse(lists:sort(L)). 366 367%% Foldl over two lists 368foldl2(F, Acc, [I1|L1], [I2|L2]) when is_function(F,3) -> 369 foldl2(F, F(I1, I2, Acc), L1, L2); 370foldl2(_F, Acc, [], []) -> 371 Acc. 372 373%% wrapper for math:log/1 to avoid dividing by zero 374math_log(0) -> 375 1; 376math_log(0.0) -> 377 1.0; 378math_log(X) when X < 0 -> 379 0; % it's not possible to take a log of a negative number, return 0 380math_log(X) -> 381 math:log(X). 382 383%% wrapper for calculating inverse to avoid dividing by zero 384inverse(0) -> 385 0; 386inverse(0.0) -> 387 0.0; 388inverse(X) -> 389 1/X. 390 391get_hist_bins(Min, Max, StdDev, Count) -> 392 BinWidth = get_bin_width(StdDev, Count), 393 BinCount = get_bin_count(Min, Max, BinWidth), 394 case get_bin_list(BinWidth, BinCount, []) of 395 List when length(List) =< 1 -> 396 [Max]; 397 Bins -> 398 %% add Min to Bins 399 [Bin + Min || Bin <- Bins] 400 end. 401 402get_bin_list(Width, Bins, Acc) when Bins > length(Acc) -> 403 Bin = ((length(Acc) + 1) * Width ), 404 get_bin_list(Width, Bins, [round_bin(Bin)| Acc]); 405get_bin_list(_, _, Acc) -> 406 lists:usort(Acc). 407 408round_bin(Bin) -> 409 Base = case erlang:trunc(math:pow(10, round(math:log10(Bin) - 1))) of 410 0 -> 411 1; 412 Else -> 413 Else 414 end, 415 %io:format("bin ~p, base ~p~n", [Bin, Base]), 416 round_bin(Bin, Base). 417 418round_bin(Bin, Base) when Bin rem Base == 0 -> 419 Bin; 420round_bin(Bin, Base) -> 421 Bin + Base - (Bin rem Base). 422 423% the following is up for debate as far as what the best method 424% of choosing bin counts and widths. these seem to work *good enough* 425% in my testing 426 427% bin width based on Sturges 428% http://www.jstor.org/pss/2965501 429get_bin_width(StdDev, Count) -> 430 %io:format("stddev: ~p, count: ~p~n", [StdDev, Count]), 431 case round((3.5 * StdDev) / math:pow(Count, 0.3333333)) of 432 0 -> 433 1; 434 Else -> 435 Else 436 end. 437 438% based on the simple ceilng function at 439% http://en.wikipedia.org/wiki/Histograms#Number_of_bins_and_width 440% with a modification to attempt to get on bin beyond the max value 441get_bin_count(Min, Max, Width) -> 442 %io:format("min: ~p, max: ~p, width ~p~n", [Min, Max, Width]), 443 round((Max - Min) / Width) + 1. 444 445%% taken from http://crunchyd.com/scutil/ 446%% All code here is MIT Licensed 447%% http://scutil.com/license.html 448 449% seems to match the value returned by the 'cor' (method="kendal") R function 450% http://en.wikipedia.org/wiki/Kendall_tau_rank_correlation_coefficient 451kendall_correlation(List1, List2) when is_list(List1), is_list(List2) -> 452 {RA,_} = lists:unzip(tied_ordered_ranking(List1)), 453 {RB,_} = lists:unzip(tied_ordered_ranking(List2)), 454 455 Ordering = lists:keysort(1, lists:zip(RA,RB)), 456 {_,OrdB} = lists:unzip(Ordering), 457 458 N = length(List1), 459 P = lists:sum(kendall_right_of(OrdB, [])), 460 461 -(( (4*P) / (N * (N - 1))) - 1). 462 463simple_ranking(List) when is_list(List) -> 464 lists:zip(lists:seq(1,length(List)),lists:reverse(lists:sort(List))). 465 466tied_ranking(List) -> 467 tied_rank_worker(simple_ranking(List), [], no_prev_value). 468 469tied_ordered_ranking(List) when is_list(List) -> 470 tied_ordered_ranking(List, tied_ranking(List), []). 471 472tied_ordered_ranking([], [], Work) -> 473 lists:reverse(Work); 474 475tied_ordered_ranking([Front|Rem], Ranks, Work) -> 476 {value,Item} = lists:keysearch(Front,2,Ranks), 477 {IRank,Front} = Item, 478 tied_ordered_ranking(Rem, Ranks--[Item], [{IRank,Front}]++Work). 479 480kendall_right_of([], Work) -> 481 lists:reverse(Work); 482kendall_right_of([F|R], Work) -> 483 kendall_right_of(R, [kendall_right_of_item(F,R)]++Work). 484 485kendall_right_of_item(B, Rem) -> 486 length([R || R <- Rem, R < B]). 487 488tied_add_prev(Work, {FoundAt, NewValue}) -> 489 lists:duplicate( length(FoundAt), {lists:sum(FoundAt)/length(FoundAt), NewValue} ) ++ Work. 490 491tied_rank_worker([], Work, PrevValue) -> 492 lists:reverse(tied_add_prev(Work, PrevValue)); 493 494tied_rank_worker([Item|Remainder], Work, PrevValue) -> 495 case PrevValue of 496 no_prev_value -> 497 {BaseRank,BaseVal} = Item, 498 tied_rank_worker(Remainder, Work, {[BaseRank],BaseVal}); 499 {FoundAt,OldVal} -> 500 case Item of 501 {Id,OldVal} -> 502 tied_rank_worker(Remainder, Work, {[Id]++FoundAt,OldVal}); 503 {Id,NewVal} -> 504 tied_rank_worker(Remainder, tied_add_prev(Work, PrevValue), {[Id],NewVal}) 505 506 end 507 end. 508 509 510percentiles(Ps, N, Values) -> 511 Items = [{P, perc(P, N)} || P <- Ps], 512 pick_items(Values, 1, Items). 513 514pick_items([H|_] = L, P, [{Tag,P}|Ps]) -> 515 [{Tag,H} | pick_items(L, P, Ps)]; 516pick_items([_|T], P, Ps) -> 517 pick_items(T, P+1, Ps); 518pick_items([], _, Ps) -> 519 [{Tag,undefined} || {Tag,_} <- Ps]. 520 521perc(P, Len) when is_integer(P), 0 =< P, P =< 100 -> 522 V = round(P * Len / 100), 523 erlang:max(1, V); 524perc(P, Len) when is_integer(P), 100 =< P, P =< 1000 -> 525 V = round(P * Len / 1000), 526 erlang:max(1, V); 527perc(P, Len) when is_float(P), 0 =< P, P =< 1 -> 528 erlang:max(1, round(P * Len)). 529