1# stat_kernel.tcl -- 2# 3# Part of the statistics package for basic statistical analysis 4# Based on http://en.wikipedia.org/wiki/Kernel_(statistics) and 5# http://en.wikipedia.org/wiki/Kernel_density_estimation 6# 7# version 0.1: initial implementation, january 2014 8 9# kernel-density -- 10# Estimate the probability density using the kernel density 11# estimation method 12# 13# Arguments: 14# data List of univariate data 15# args List of options in the form of keyword-value pairs: 16# -weights weights: per data point the weight 17# -bandwidth value: bandwidth to be used for the estimation 18# -number value: number of bins to be returned 19# -interval {begin end}: begin and end of the interval for 20# which the density is returned 21# -kernel function: kernel to be used (gaussian, cosine, 22# epanechnikov, uniform, triangular, biweight, 23# logistic) 24# For all options more or less sensible defaults are 25# provided. 26# 27# Result: 28# A list of the bin centres, a list of the corresponding density 29# estimates and a list containing several computational parameters: 30# begin and end of the interval, mean, standard deviation and bandwidth 31# 32# Note: 33# The conditions for the kernel function are fairly weak: 34# - It should integrate to 1 35# - It should be symmetric around 0 36# 37# As for the implementation in Tcl: it should be reachable in the 38# ::math::statistics namespace. As a consequence, you can define 39# your own kernel function too. Hence there is no check. 40# 41proc ::math::statistics::kernel-density {data args} { 42 43 # 44 # Determine the basic statistics 45 # 46 set basicStats [BasicStats all $data] 47 48 set mean [lindex $basicStats 0] 49 set ndata [lindex $basicStats 3] 50 set stdev [lindex $basicStats 4] 51 52 if { $ndata < 1 } { 53 return -code error -errorcode ARG -errorinfo "Too few actual data" 54 } 55 56 # 57 # Get the options (providing defaults as needed) 58 # 59 set opt(-weights) {} 60 set opt(-number) 100 61 set opt(-kernel) gaussian 62 63 # 64 # The default bandwidth is set via a simple expression, which 65 # is supposed to be optimal for the Gaussian kernel. 66 # Perhaps a more sophisticated method should be provided as well 67 # 68 set opt(-bandwidth) [expr {1.06 * $stdev / pow($ndata,0.2)}] 69 70 # 71 # The default interval is derived from the mean and the 72 # standard deviation 73 # 74 set opt(-interval) [list [expr {$mean - 3.0 * $stdev}] [expr {$mean + 3.0 * $stdev}]] 75 76 # 77 # Retrieve the given options from $args 78 # 79 if { [llength $args] % 2 != 0 } { 80 return -code error -errorcode ARG -errorinfo "The options must all have a value" 81 } 82 array set opt $args 83 84 # 85 # Elementary checks 86 # 87 if { $opt(-bandwidth) <= 0.0 } { 88 return -code error -errorcode ARG -errorinfo "The bandwidth must be positive: $opt(-bandwidth)" 89 } 90 91 if { $opt(-number) <= 0.0 } { 92 return -code error -errorcode ARG -errorinfo "The number of bins must be positive: $opt(-number)" 93 } 94 95 if { [lindex $opt(-interval) 0] == [lindex $opt(-interval) 1] } { 96 return -code error -errorcode ARG -errorinfo "The interval has length zero: $opt(-interval)" 97 } 98 99 if { [llength [info proc $opt(-kernel)]] == 0 } { 100 return -code error -errorcode ARG -errorinfo "Unknown kernel function: $opt(-kernel)" 101 } 102 103 # 104 # Construct the weights 105 # 106 if { [llength $opt(-weights)] > 0 } { 107 if { [llength $data] != [llength $opt(-weights)] } { 108 return -code error -errorcode ARG -errorinfo "The list of weights must match the data" 109 } 110 111 set sum 0.0 112 foreach d $data w $opt(-weights) { 113 if { $d != {} } { 114 set sum [expr {$sum + $w}] 115 } 116 } 117 set scale [expr {1.0/$sum/$ndata}] 118 119 set weight {} 120 foreach w $opt(-weights) { 121 if { $d != {} } { 122 lappend weight [expr {$w / $scale}] 123 } else { 124 lappend weight {} 125 } 126 } 127 } else { 128 set weight [lrepeat [llength $data] [expr {1.0/$ndata}]] ;# Note: missing values have weight zero 129 } 130 131 # 132 # Construct the centres of the bins 133 # 134 set xbegin [lindex $opt(-interval) 0] 135 set xend [lindex $opt(-interval) 1] 136 set dx [expr {($xend - $xbegin) / double($opt(-number))}] 137 set xb [expr {$xbegin + 0.5 * $dx}] 138 set xvalue {} 139 for {set i 0} {$i < $opt(-number)} {incr i} { 140 lappend xvalue [expr {$xb + $i * $dx}] 141 } 142 143 # 144 # Construct the density function 145 # 146 set density {} 147 set scale [expr {1.0/$opt(-bandwidth)}] 148 foreach x $xvalue { 149 set sum 0.0 150 foreach d $data w $weight { 151 if { $d != {} } { 152 set kvalue [$opt(-kernel) [expr {$scale * ($x-$d)}]] 153 set sum [expr {$sum + $w * $kvalue}] 154 } 155 } 156 lappend density [expr {$sum * $scale}] 157 } 158 159 # 160 # Return the result 161 # 162 return [list $xvalue $density [list $xbegin $xend $mean $stdev $opt(-bandwidth)]] 163} 164 165# gaussian, uniform, triangular, epanechnikov, biweight, cosine, logistic -- 166# The Gaussian kernel 167# 168# Arguments: 169# x (Scaled) argument 170# 171# Result: 172# Value of the kernel 173# 174# Note: 175# The standard deviation is 1. 176# 177proc ::math::statistics::gaussian {x} { 178 return [expr {exp(-0.5*$x*$x) / sqrt(2.0*acos(-1.0))}] 179} 180proc ::math::statistics::uniform {x} { 181 if { abs($x) <= 1.0 } { 182 return 0.5 183 } else { 184 return 0.0 185 } 186} 187proc ::math::statistics::triangular {x} { 188 if { abs($x) < 1.0 } { 189 return [expr {1.0 - abs($x)}] 190 } else { 191 return 0.0 192 } 193} 194proc ::math::statistics::epanechnikov {x} { 195 if { abs($x) < 1.0 } { 196 return [expr {0.75 * (1.0 - abs($x)*abs($x))}] 197 } else { 198 return 0.0 199 } 200} 201proc ::math::statistics::biweight {x} { 202 if { abs($x) < 1.0 } { 203 return [expr {0.9375 * pow((1.0 - abs($x)*abs($x)),2)}] 204 } else { 205 return 0.0 206 } 207} 208proc ::math::statistics::cosine {x} { 209 if { abs($x) < 1.0 } { 210 return [expr {0.25 * acos(-1.0) * cos(0.5 * acos(-1.0) * $x)}] 211 } else { 212 return 0.0 213 } 214} 215proc ::math::statistics::logistic {x} { 216 return [expr {1.0 / (exp($x) + 2.0 + exp(-$x))}] 217} 218