1# stat_kernel.tcl --
2#
3#    Part of the statistics package for basic statistical analysis
4#    Based on http://en.wikipedia.org/wiki/Kernel_(statistics) and
5#             http://en.wikipedia.org/wiki/Kernel_density_estimation
6#
7# version 0.1:   initial implementation, january 2014
8
9# kernel-density --
10#     Estimate the probability density using the kernel density
11#     estimation method
12#
13# Arguments:
14#     data            List of univariate data
15#     args            List of options in the form of keyword-value pairs:
16#                     -weights weights: per data point the weight
17#                     -bandwidth value: bandwidth to be used for the estimation
18#                     -number value: number of bins to be returned
19#                     -interval {begin end}: begin and end of the interval for
20#                         which the density is returned
21#                     -kernel function: kernel to be used (gaussian, cosine,
22#                         epanechnikov, uniform, triangular, biweight,
23#                         logistic)
24#                     For all options more or less sensible defaults are
25#                     provided.
26#
27# Result:
28#     A list of the bin centres, a list of the corresponding density
29#     estimates and a list containing several computational parameters:
30#     begin and end of the interval, mean, standard deviation and bandwidth
31#
32# Note:
33#     The conditions for the kernel function are fairly weak:
34#     - It should integrate to 1
35#     - It should be symmetric around 0
36#
37#     As for the implementation in Tcl: it should be reachable in the
38#     ::math::statistics namespace. As a consequence, you can define
39#     your own kernel function too. Hence there is no check.
40#
41proc ::math::statistics::kernel-density {data args} {
42
43    #
44    # Determine the basic statistics
45    #
46    set basicStats [BasicStats all $data]
47
48    set mean       [lindex $basicStats 0]
49    set ndata      [lindex $basicStats 3]
50    set stdev      [lindex $basicStats 4]
51
52    if { $ndata < 1 } {
53        return -code error -errorcode ARG -errorinfo "Too few actual data"
54    }
55
56    #
57    # Get the options (providing defaults as needed)
58    #
59    set opt(-weights)   {}
60    set opt(-number)    100
61    set opt(-kernel)    gaussian
62
63    #
64    # The default bandwidth is set via a simple expression, which
65    # is supposed to be optimal for the Gaussian kernel.
66    # Perhaps a more sophisticated method should be provided as well
67    #
68    set opt(-bandwidth) [expr {1.06 * $stdev / pow($ndata,0.2)}]
69
70    #
71    # The default interval is derived from the mean and the
72    # standard deviation
73    #
74    set opt(-interval) [list [expr {$mean - 3.0 * $stdev}] [expr {$mean + 3.0 * $stdev}]]
75
76    #
77    # Retrieve the given options from $args
78    #
79    if { [llength $args] % 2 != 0 } {
80        return -code error -errorcode ARG -errorinfo "The options must all have a value"
81    }
82    array set opt $args
83
84    #
85    # Elementary checks
86    #
87    if { $opt(-bandwidth) <= 0.0 } {
88        return -code error -errorcode ARG -errorinfo "The bandwidth must be positive: $opt(-bandwidth)"
89    }
90
91    if { $opt(-number) <= 0.0 } {
92        return -code error -errorcode ARG -errorinfo "The number of bins must be positive: $opt(-number)"
93    }
94
95    if { [lindex $opt(-interval) 0] == [lindex $opt(-interval) 1] } {
96        return -code error -errorcode ARG -errorinfo "The interval has length zero: $opt(-interval)"
97    }
98
99    if { [llength [info proc $opt(-kernel)]] == 0 } {
100        return -code error -errorcode ARG -errorinfo "Unknown kernel function: $opt(-kernel)"
101    }
102
103    #
104    # Construct the weights
105    #
106    if { [llength $opt(-weights)] > 0 } {
107        if { [llength $data] != [llength $opt(-weights)] } {
108            return -code error -errorcode ARG -errorinfo "The list of weights must match the data"
109        }
110
111        set sum 0.0
112        foreach d $data w $opt(-weights) {
113            if { $d != {} } {
114                set sum [expr {$sum + $w}]
115            }
116        }
117        set scale [expr {1.0/$sum/$ndata}]
118
119        set weight {}
120        foreach w $opt(-weights) {
121            if { $d != {} } {
122                lappend weight [expr {$w / $scale}]
123            } else {
124                lappend weight {}
125            }
126        }
127    } else {
128        set weight [lrepeat [llength $data] [expr {1.0/$ndata}]] ;# Note: missing values have weight zero
129    }
130
131    #
132    # Construct the centres of the bins
133    #
134    set xbegin [lindex $opt(-interval) 0]
135    set xend   [lindex $opt(-interval) 1]
136    set dx     [expr {($xend - $xbegin) / double($opt(-number))}]
137    set xb     [expr {$xbegin + 0.5 * $dx}]
138    set xvalue {}
139    for {set i 0} {$i < $opt(-number)} {incr i} {
140        lappend xvalue [expr {$xb + $i * $dx}]
141    }
142
143    #
144    # Construct the density function
145    #
146    set density {}
147    set scale   [expr {1.0/$opt(-bandwidth)}]
148    foreach x $xvalue {
149        set sum 0.0
150        foreach d $data w $weight {
151            if { $d != {} } {
152                set kvalue [$opt(-kernel) [expr {$scale * ($x-$d)}]]
153                set sum [expr {$sum + $w * $kvalue}]
154            }
155        }
156        lappend density [expr {$sum * $scale}]
157    }
158
159    #
160    # Return the result
161    #
162    return [list $xvalue $density [list $xbegin $xend $mean $stdev $opt(-bandwidth)]]
163}
164
165# gaussian, uniform, triangular, epanechnikov, biweight, cosine, logistic --
166#    The Gaussian kernel
167#
168# Arguments:
169#    x            (Scaled) argument
170#
171# Result:
172#    Value of the kernel
173#
174# Note:
175#    The standard deviation is 1.
176#
177proc ::math::statistics::gaussian {x} {
178    return [expr {exp(-0.5*$x*$x) / sqrt(2.0*acos(-1.0))}]
179}
180proc ::math::statistics::uniform {x} {
181    if { abs($x) <= 1.0 } {
182        return 0.5
183    } else {
184        return 0.0
185    }
186}
187proc ::math::statistics::triangular {x} {
188    if { abs($x) < 1.0 } {
189        return [expr {1.0 - abs($x)}]
190    } else {
191        return 0.0
192    }
193}
194proc ::math::statistics::epanechnikov {x} {
195    if { abs($x) < 1.0 } {
196        return [expr {0.75 * (1.0 - abs($x)*abs($x))}]
197    } else {
198        return 0.0
199    }
200}
201proc ::math::statistics::biweight {x} {
202    if { abs($x) < 1.0 } {
203        return [expr {0.9375 * pow((1.0 - abs($x)*abs($x)),2)}]
204    } else {
205        return 0.0
206    }
207}
208proc ::math::statistics::cosine {x} {
209    if { abs($x) < 1.0 } {
210        return [expr {0.25 * acos(-1.0) * cos(0.5 * acos(-1.0) * $x)}]
211    } else {
212        return 0.0
213    }
214}
215proc ::math::statistics::logistic {x} {
216    return [expr {1.0 / (exp($x) + 2.0 + exp(-$x))}]
217}
218