1%%%
2%%% Copyright 2011, Boundary
3%%%
4%%% Licensed under the Apache License, Version 2.0 (the "License");
5%%% you may not use this file except in compliance with the License.
6%%% You may obtain a copy of the License at
7%%%
8%%%     http://www.apache.org/licenses/LICENSE-2.0
9%%%
10%%% Unless required by applicable law or agreed to in writing, software
11%%% distributed under the License is distributed on an "AS IS" BASIS,
12%%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13%%% See the License for the specific language governing permissions and
14%%% limitations under the License.
15%%%
16
17
18%%%-------------------------------------------------------------------
19%%% File:      bear.erl
20%%% @author    joe williams <j@boundary.com>
21%%% @doc
22%%% statistics functions for calucating based on id and a list of values
23%%% @end
24%%%------------------------------------------------------------------
25
26-module(bear).
27
28-compile([export_all]).
29
30-export([
31         get_statistics/1,
32         get_statistics/2
33        ]).
34
35-define(HIST_BINS, 10).
36
37-define(STATS_MIN, 5).
38
39-record(scan_result, {n=0, sumX=0, sumXX=0, sumInv=0, sumLog, max, min}).
40-record(scan_result2, {x2=0, x3=0, x4=0}).
41
42
43get_statistics([_,_,_,_,_|_] = Values) ->
44    Scan_res = scan_values(Values),
45    Scan_res2 = scan_values2(Values, Scan_res),
46    Variance = variance(Scan_res, Scan_res2),
47    SortedValues = lists:sort(Values),
48    [
49     {min, Scan_res#scan_result.min},
50     {max, Scan_res#scan_result.max},
51     {arithmetic_mean, arithmetic_mean(Scan_res)},
52     {geometric_mean, geometric_mean(Scan_res)},
53     {harmonic_mean, harmonic_mean(Scan_res)},
54     {median, percentile(SortedValues, Scan_res, 0.5)},
55     {variance, Variance},
56     {standard_deviation, std_deviation(Scan_res, Scan_res2)},
57     {skewness, skewness(Scan_res, Scan_res2)},
58     {kurtosis, kurtosis(Scan_res, Scan_res2)},
59     {percentile,
60      [
61       {50, percentile(SortedValues, Scan_res, 0.50)},
62       {75, percentile(SortedValues, Scan_res, 0.75)},
63       {90, percentile(SortedValues, Scan_res, 0.90)},
64       {95, percentile(SortedValues, Scan_res, 0.95)},
65       {99, percentile(SortedValues, Scan_res, 0.99)},
66       {999, percentile(SortedValues, Scan_res, 0.999)}
67      ]
68     },
69     {histogram, get_histogram(Values, Scan_res, Scan_res2)},
70     {n, Scan_res#scan_result.n}
71    ];
72get_statistics(Values) when is_list(Values) ->
73    [
74     {min, 0.0},
75     {max, 0.0},
76     {arithmetic_mean, 0.0},
77     {geometric_mean, 0.0},
78     {harmonic_mean, 0.0},
79     {median, 0.0},
80     {variance, 0.0},
81     {standard_deviation, 0.0},
82     {skewness, 0.0},
83     {kurtosis, 0.0},
84     {percentile,
85      [
86       {50, 0.0},
87       {75, 0.0},
88       {90, 0.0},
89       {95, 0.0},
90       {99, 0.0},
91       {999, 0.0}
92      ]
93     },
94     {histogram, [{0, 0}]},
95     {n, 0}
96    ].
97
98get_statistics_subset([_,_,_,_,_|_] = Values, Items) ->
99    Length = length(Values),
100    SortedValues = lists:sort(Values),
101    Steps = calc_steps(Items),
102    Scan_res = if Steps > 1 -> scan_values(Values);
103        true -> []
104    end,
105    Scan_res2 = if Steps > 2 -> scan_values2(Values, Scan_res);
106        true -> []
107    end,
108    report_subset(Items, Length, SortedValues, Scan_res, Scan_res2);
109get_statistics_subset(Values, Items) when is_list(Values) ->
110    get_null_statistics_subset(Items, []).
111
112get_null_statistics_subset([{percentile, Ps}|Items], Acc) ->
113    get_null_statistics_subset(Items, [{percentile, [{P, 0.0} || P <- Ps]}|Acc]);
114get_null_statistics_subset([I|Items], Acc) ->
115    get_null_statistics_subset(Items, [{I, 0.0}|Acc]);
116get_null_statistics_subset([], Acc) ->
117    lists:reverse(Acc).
118
119calc_steps(Items) ->
120    lists:foldl(
121        fun({I,_},Acc) ->
122            erlang:max(level(I), Acc);
123           (I,Acc) ->
124            erlang:max(level(I), Acc)
125    end, 1, Items).
126
127level(standard_deviation) -> 3;
128level(variance          ) -> 3;
129level(skewness          ) -> 3;
130level(kurtosis          ) -> 3;
131level(histogram         ) -> 3;
132level(arithmetic_mean   ) -> 2;
133level(geometric_mean    ) -> 2;
134level(harmonic_mean     ) -> 2;
135level(_) -> 1.
136
137report_subset(Items, N, SortedValues, Scan_res, Scan_res2) ->
138    lists:map(
139      fun(min) -> {min, hd(SortedValues)};
140         (max) -> {max, lists:last(SortedValues)};
141         (arithmetic_mean) -> {arithmetic_mean, arithmetic_mean(Scan_res)};
142         (harmonic_mean) -> {harmonic_mean, harmonic_mean(Scan_res)};
143         (geometric_mean) -> {geometric_mean, geometric_mean(Scan_res)};
144         (median) -> {median, percentile(SortedValues,
145                                         #scan_result{n = N}, 0.5)};
146         (variance) -> {variance, variance(Scan_res, Scan_res2)};
147         (standard_deviation=I) -> {I, std_deviation(Scan_res, Scan_res2)};
148         (skewness) -> {skewness, skewness(Scan_res, Scan_res2)};
149         (kurtosis) -> {kurtosis, kurtosis(Scan_res, Scan_res2)};
150         ({percentile,Ps}) -> {percentile, percentiles(Ps, N, SortedValues)};
151         (histogram) ->
152              {histogram, get_histogram(SortedValues, Scan_res, Scan_res2)};
153         (n) -> {n, N}
154      end, Items).
155
156get_statistics(Values, _) when length(Values) < ?STATS_MIN ->
157    0.0;
158get_statistics(_, Values) when length(Values) < ?STATS_MIN ->
159    0.0;
160get_statistics(Values1, Values2) when length(Values1) /= length(Values2) ->
161    0.0;
162get_statistics(Values1, Values2) ->
163    [
164     {covariance, get_covariance(Values1, Values2)},
165     {tau, get_kendall_correlation(Values1, Values2)},
166     {rho, get_pearson_correlation(Values1, Values2)},
167     {r, get_spearman_correlation(Values1, Values2)}
168    ].
169
170%%%===================================================================
171%%% Internal functions
172%%%===================================================================
173
174scan_values([X|Values]) ->
175    scan_values(Values, #scan_result{n=1, sumX=X, sumXX=X*X,
176             sumLog=math_log(X),
177             max=X, min=X, sumInv=inverse(X)}).
178
179scan_values([X|Values],
180      #scan_result{n=N, sumX=SumX, sumXX=SumXX, sumLog=SumLog,
181                   max=Max, min=Min, sumInv=SumInv}=Acc) ->
182    scan_values(Values,
183                Acc#scan_result{n=N+1, sumX=SumX+X, sumXX=SumXX+X*X,
184                                sumLog=SumLog+math_log(X),
185                                max=max(X,Max), min=min(X,Min),
186                                sumInv=SumInv+inverse(X)});
187scan_values([], Acc) ->
188    Acc.
189
190scan_values2(Values, #scan_result{n=N, sumX=SumX}) ->
191    scan_values2(Values, SumX/N, #scan_result2{}).
192
193scan_values2([X|Values], Mean, #scan_result2{x2=X2, x3=X3, x4=X4}=Acc) ->
194    Diff = X-Mean,
195    Diff2 = Diff*Diff,
196    Diff3 = Diff2*Diff,
197    Diff4 = Diff2*Diff2,
198    scan_values2(Values, Mean, Acc#scan_result2{x2=X2+Diff2, x3=X3+Diff3,
199            x4=X4+Diff4});
200scan_values2([], _, Acc) ->
201    Acc.
202
203
204arithmetic_mean(#scan_result{n=N, sumX=Sum}) ->
205    Sum/N.
206
207geometric_mean(#scan_result{n=N, sumLog=SumLog}) ->
208    math:exp(SumLog/N).
209
210harmonic_mean(#scan_result{sumInv=Zero}) when Zero =:= 0 orelse
211                                              Zero =:= 0.0 ->
212    %% Protect against divide by 0 if we have all 0 values
213    0;
214harmonic_mean(#scan_result{n=N, sumInv=Sum}) ->
215    N/Sum.
216
217percentile(SortedValues, #scan_result{n=N}, Percentile)
218  when is_list(SortedValues) ->
219    Element = round(Percentile * N),
220    lists:nth(Element, SortedValues).
221
222%% Two pass variance
223%% Results match those given by the 'var' function in R
224variance(#scan_result{n=N}, #scan_result2{x2=X2}) ->
225    X2/(N-1).
226
227std_deviation(Scan_res, Scan_res2) ->
228    math:sqrt(variance(Scan_res, Scan_res2)).
229
230%% http://en.wikipedia.org/wiki/Skewness
231%%
232%% skewness results should match this R function:
233%% skewness <- function(x) {
234%%    m3 <- mean((x - mean(x))^3)
235%%    skew <- m3 / (sd(x)^3)
236%%    skew
237%% }
238skewness(#scan_result{n=N}=Scan_res, #scan_result2{x3=X3}=Scan_res2) ->
239    case math:pow(std_deviation(Scan_res,Scan_res2), 3) of
240        0.0 ->
241            0.0;  %% Is this really the correct thing to do here?
242        Else ->
243            (X3/N)/Else
244    end.
245
246%% http://en.wikipedia.org/wiki/Kurtosis
247%%
248%% results should match this R function:
249%% kurtosis <- function(x) {
250%%     m4 <- mean((x - mean(x))^4)
251%%     kurt <- m4 / (sd(x)^4) - 3
252%%     kurt
253%% }
254kurtosis(#scan_result{n=N}=Scan_res, #scan_result2{x4=X4}=Scan_res2) ->
255    case math:pow(std_deviation(Scan_res,Scan_res2), 4) of
256        0.0 ->
257            0.0;  %% Is this really the correct thing to do here?
258        Else ->
259            ((X4/N)/Else) - 3
260    end.
261
262get_histogram(Values, Scan_res, Scan_res2) ->
263    Bins = get_hist_bins(Scan_res#scan_result.min,
264                         Scan_res#scan_result.max,
265                         std_deviation(Scan_res, Scan_res2),
266                         length(Values)
267                        ),
268
269    Dict = lists:foldl(fun (Value, Dict) ->
270             update_bin(Value, Bins, Dict)
271           end,
272           dict:from_list([{Bin, 0} || Bin <- Bins]),
273           Values),
274
275    lists:sort(dict:to_list(Dict)).
276
277update_bin(Value, [Bin|_Bins], Dict) when Value =< Bin ->
278    dict:update_counter(Bin, 1, Dict);
279update_bin(Values, [_Bin|Bins], Dict) ->
280    update_bin(Values, Bins, Dict).
281
282%% two pass covariance
283%% (http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Covariance)
284%% matches results given by excel's 'covar' function
285get_covariance(Values, _) when length(Values) < ?STATS_MIN ->
286    0.0;
287get_covariance(_, Values) when length(Values) < ?STATS_MIN ->
288    0.0;
289get_covariance(Values1, Values2) when length(Values1) /= length(Values2) ->
290    0.0;
291get_covariance(Values1, Values2) ->
292    {SumX, SumY, N} = foldl2(fun (X, Y, {SumX, SumY, N}) ->
293             {SumX+X, SumY+Y, N+1}
294           end, {0,0,0}, Values1, Values2),
295    MeanX = SumX/N,
296    MeanY = SumY/N,
297    Sum = foldl2(fun (X, Y, Sum) ->
298       Sum + ((X - MeanX) * (Y - MeanY))
299     end,
300     0, Values1, Values2),
301    Sum/N.
302
303get_kendall_correlation(Values, _) when length(Values) < ?STATS_MIN ->
304    0.0;
305get_kendall_correlation(_, Values) when length(Values) < ?STATS_MIN ->
306    0.0;
307get_kendall_correlation(Values1, Values2) when length(Values1) /= length(Values2) ->
308    0.0;
309get_kendall_correlation(Values1, Values2) ->
310    bear:kendall_correlation(Values1, Values2).
311
312get_spearman_correlation(Values, _) when length(Values) < ?STATS_MIN ->
313    0.0;
314get_spearman_correlation(_, Values) when length(Values) < ?STATS_MIN ->
315    0.0;
316get_spearman_correlation(Values1, Values2) when length(Values1) /= length(Values2) ->
317    0.0;
318get_spearman_correlation(Values1, Values2) ->
319    TR1 = ranks_of(Values1),
320    TR2 = ranks_of(Values2),
321    Numerator   = 6 * foldl2(fun (X, Y, Acc) ->
322             Diff = X-Y,
323             Acc + Diff*Diff
324           end, 0, TR1,TR2),
325    N = length(Values1),
326    Denominator = math:pow(N,3)-N,
327    1-(Numerator/Denominator).
328
329ranks_of(Values) when is_list(Values) ->
330    [Fst|Rest] = revsort(Values),
331    TRs = ranks_of(Rest, [], 2, Fst, 1),
332    Dict = gb_trees:from_orddict(TRs),
333    L = lists:foldl(fun (Val, Acc) ->
334                            Rank = gb_trees:get(Val, Dict),
335                            [Rank|Acc]
336                    end, [], Values),
337    lists:reverse(L).
338
339ranks_of([E|Es], Acc, N, P, S) ->
340    ranks_of(Es,[{P,(S+N-1)/2}|Acc], N+1, E, N);
341ranks_of([],  Acc, N, P, S) ->
342    [{P,(S+N-1)/2}|Acc].
343
344
345get_pearson_correlation(Values, _) when length(Values) < ?STATS_MIN ->
346    0.0;
347get_pearson_correlation(_, Values) when length(Values) < ?STATS_MIN ->
348    0.0;
349get_pearson_correlation(Values1, Values2) when length(Values1) /= length(Values2) ->
350    0.0;
351get_pearson_correlation(Values1, Values2) ->
352    {SumX, SumY, SumXX, SumYY, SumXY, N} =
353  foldl2(fun (X,Y,{SX, SY, SXX, SYY, SXY, N}) ->
354          {SX+X, SY+Y, SXX+X*X, SYY+Y*Y, SXY+X*Y, N+1}
355        end, {0,0,0,0,0,0}, Values1, Values2),
356    Numer = (N*SumXY) - (SumX * SumY),
357    case math:sqrt(((N*SumXX)-(SumX*SumX)) * ((N*SumYY)-(SumY*SumY))) of
358        0.0 ->
359            0.0; %% Is this really the correct thing to do here?
360        Denom ->
361            Numer/Denom
362    end.
363
364revsort(L) ->
365    lists:reverse(lists:sort(L)).
366
367%% Foldl over two lists
368foldl2(F, Acc, [I1|L1], [I2|L2]) when is_function(F,3) ->
369    foldl2(F, F(I1, I2, Acc), L1, L2);
370foldl2(_F, Acc, [], []) ->
371    Acc.
372
373%% wrapper for math:log/1 to avoid dividing by zero
374math_log(0) ->
375    1;
376math_log(0.0) ->
377    1.0;
378math_log(X) when X < 0 ->
379    0; % it's not possible to take a log of a negative number, return 0
380math_log(X) ->
381    math:log(X).
382
383%% wrapper for calculating inverse to avoid dividing by zero
384inverse(0) ->
385    0;
386inverse(0.0) ->
387    0.0;
388inverse(X) ->
389    1/X.
390
391get_hist_bins(Min, Max, StdDev, Count) ->
392    BinWidth = get_bin_width(StdDev, Count),
393    BinCount = get_bin_count(Min, Max, BinWidth),
394    case get_bin_list(BinWidth, BinCount, []) of
395        List when length(List) =< 1 ->
396            [Max];
397        Bins ->
398            %% add Min to Bins
399            [Bin + Min || Bin <- Bins]
400    end.
401
402get_bin_list(Width, Bins, Acc) when Bins > length(Acc) ->
403    Bin = ((length(Acc) + 1) * Width ),
404    get_bin_list(Width, Bins, [round_bin(Bin)| Acc]);
405get_bin_list(_, _, Acc) ->
406    lists:usort(Acc).
407
408round_bin(Bin) ->
409    Base = case erlang:trunc(math:pow(10, round(math:log10(Bin) - 1))) of
410        0 ->
411            1;
412        Else ->
413            Else
414    end,
415    %io:format("bin ~p, base ~p~n", [Bin, Base]),
416    round_bin(Bin, Base).
417
418round_bin(Bin, Base) when Bin rem Base == 0 ->
419    Bin;
420round_bin(Bin, Base) ->
421    Bin + Base - (Bin rem Base).
422
423% the following is up for debate as far as what the best method
424% of choosing bin counts and widths. these seem to work *good enough*
425% in my testing
426
427% bin width based on Sturges
428% http://www.jstor.org/pss/2965501
429get_bin_width(StdDev, Count) ->
430    %io:format("stddev: ~p, count: ~p~n", [StdDev, Count]),
431    case round((3.5 * StdDev) / math:pow(Count, 0.3333333)) of
432        0 ->
433            1;
434        Else ->
435            Else
436    end.
437
438% based on the simple ceilng function at
439% http://en.wikipedia.org/wiki/Histograms#Number_of_bins_and_width
440% with a modification to attempt to get on bin beyond the max value
441get_bin_count(Min, Max, Width) ->
442    %io:format("min: ~p, max: ~p, width ~p~n", [Min, Max, Width]),
443    round((Max - Min) / Width) + 1.
444
445%% taken from http://crunchyd.com/scutil/
446%% All code here is MIT Licensed
447%%  http://scutil.com/license.html
448
449% seems to match the value returned by the 'cor' (method="kendal") R function
450% http://en.wikipedia.org/wiki/Kendall_tau_rank_correlation_coefficient
451kendall_correlation(List1, List2) when is_list(List1), is_list(List2) ->
452    {RA,_} = lists:unzip(tied_ordered_ranking(List1)),
453    {RB,_} = lists:unzip(tied_ordered_ranking(List2)),
454
455    Ordering = lists:keysort(1, lists:zip(RA,RB)),
456    {_,OrdB} = lists:unzip(Ordering),
457
458    N = length(List1),
459    P = lists:sum(kendall_right_of(OrdB, [])),
460
461    -(( (4*P) / (N * (N - 1))) - 1).
462
463simple_ranking(List) when is_list(List) ->
464    lists:zip(lists:seq(1,length(List)),lists:reverse(lists:sort(List))).
465
466tied_ranking(List) ->
467    tied_rank_worker(simple_ranking(List), [], no_prev_value).
468
469tied_ordered_ranking(List) when is_list(List) ->
470    tied_ordered_ranking(List, tied_ranking(List), []).
471
472tied_ordered_ranking([], [], Work) ->
473    lists:reverse(Work);
474
475tied_ordered_ranking([Front|Rem], Ranks, Work) ->
476    {value,Item}  = lists:keysearch(Front,2,Ranks),
477    {IRank,Front} = Item,
478    tied_ordered_ranking(Rem, Ranks--[Item], [{IRank,Front}]++Work).
479
480kendall_right_of([], Work) ->
481    lists:reverse(Work);
482kendall_right_of([F|R], Work) ->
483    kendall_right_of(R, [kendall_right_of_item(F,R)]++Work).
484
485kendall_right_of_item(B, Rem) ->
486    length([R || R <- Rem, R < B]).
487
488tied_add_prev(Work, {FoundAt, NewValue}) ->
489    lists:duplicate( length(FoundAt), {lists:sum(FoundAt)/length(FoundAt), NewValue} ) ++ Work.
490
491tied_rank_worker([], Work, PrevValue) ->
492    lists:reverse(tied_add_prev(Work, PrevValue));
493
494tied_rank_worker([Item|Remainder], Work, PrevValue) ->
495    case PrevValue of
496        no_prev_value ->
497            {BaseRank,BaseVal} = Item,
498            tied_rank_worker(Remainder, Work, {[BaseRank],BaseVal});
499        {FoundAt,OldVal} ->
500            case Item of
501                {Id,OldVal} ->
502                    tied_rank_worker(Remainder, Work, {[Id]++FoundAt,OldVal});
503                {Id,NewVal} ->
504                    tied_rank_worker(Remainder, tied_add_prev(Work, PrevValue), {[Id],NewVal})
505
506            end
507    end.
508
509
510percentiles(Ps, N, Values) ->
511    Items = [{P, perc(P, N)} || P <- Ps],
512    pick_items(Values, 1, Items).
513
514pick_items([H|_] = L, P, [{Tag,P}|Ps]) ->
515    [{Tag,H} | pick_items(L, P, Ps)];
516pick_items([_|T], P, Ps) ->
517    pick_items(T, P+1, Ps);
518pick_items([], _, Ps) ->
519    [{Tag,undefined} || {Tag,_} <- Ps].
520
521perc(P, Len) when is_integer(P), 0 =< P, P =< 100 ->
522    V = round(P * Len / 100),
523    erlang:max(1, V);
524perc(P, Len) when is_integer(P), 100 =< P, P =< 1000 ->
525    V = round(P * Len / 1000),
526    erlang:max(1, V);
527perc(P, Len) when is_float(P), 0 =< P, P =< 1 ->
528    erlang:max(1, round(P * Len)).
529