1 /* 2 * This file is part of ELKI: 3 * Environment for Developing KDD-Applications Supported by Index-Structures 4 * 5 * Copyright (C) 2018 6 * ELKI Development Team 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Affero General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Affero General Public License for more details. 17 * 18 * You should have received a copy of the GNU Affero General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 */ 21 package de.lmu.ifi.dbs.elki.utilities.scaling.outlier; 22 23 import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; 24 import de.lmu.ifi.dbs.elki.database.ids.DBIDs; 25 import de.lmu.ifi.dbs.elki.database.relation.DoubleRelation; 26 import de.lmu.ifi.dbs.elki.logging.Logging; 27 import de.lmu.ifi.dbs.elki.math.MathUtil; 28 import de.lmu.ifi.dbs.elki.math.MeanVariance; 29 import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; 30 import de.lmu.ifi.dbs.elki.utilities.Alias; 31 import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; 32 import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; 33 34 import net.jafama.FastMath; 35 36 /** 37 * Tries to fit a mixture model (exponential for inliers and gaussian for 38 * outliers) to the outlier score distribution. 39 * <p> 40 * Note: we found this method to often fail, and fit the normal distribution to 41 * the inliers instead of the outliers, yielding reversed results. 42 * <p> 43 * Reference: 44 * <p> 45 * J. Gao, P.-N. Tan<br> 46 * Converting Output Scores from Outlier Detection Algorithms into Probability 47 * Estimates<br> 48 * Proc. Sixth International Conference on Data Mining, 2006. ICDM'06. 49 * 50 * @author Erich Schubert 51 * @since 0.4.0 52 */ 53 @Reference(authors = "J. Gao, P.-N. Tan", // 54 title = "Converting Output Scores from Outlier Detection Algorithms into Probability Estimates", // 55 booktitle = "Proc. Sixth International Conference on Data Mining, 2006. ICDM'06.", // 56 url = "https://doi.org/10.1109/ICDM.2006.43", // 57 bibkey = "DBLP:conf/icdm/GaoT06") 58 @Alias("de.lmu.ifi.dbs.elki.utilities.scaling.outlier.MixtureModelOutlierScalingFunction") 59 public class MixtureModelOutlierScaling implements OutlierScaling { 60 /** 61 * The logger for this class. 62 */ 63 private static final Logging LOG = Logging.getLogger(MixtureModelOutlierScaling.class); 64 65 /** 66 * Parameter mu of the gaussian distribution (outliers) 67 */ 68 protected double mu; 69 70 /** 71 * Parameter sigma of the gaussian distribution (outliers) 72 */ 73 protected double sigma; 74 75 /** 76 * Parameter lambda of the exponential distribution (inliers) 77 */ 78 protected double lambda; 79 80 /** 81 * Mixing parameter alpha 82 */ 83 protected double alpha; 84 85 /** 86 * Precomputed static value 87 */ 88 public static final double ONEBYSQRT2PI = 1.0 / MathUtil.SQRTTWOPI; 89 90 /** 91 * Convergence parameter 92 */ 93 private static final double DELTA = 0.0001; 94 95 /** 96 * Compute p_i (Gaussian distribution, outliers) 97 * 98 * @param f value 99 * @param mu Mu parameter 100 * @param sigma Sigma parameter 101 * @return probability 102 */ calcP_i(double f, double mu, double sigma)103 protected static double calcP_i(double f, double mu, double sigma) { 104 final double fmu = f - mu; 105 return ONEBYSQRT2PI / sigma * FastMath.exp(fmu * fmu / (-2 * sigma * sigma)); 106 } 107 108 /** 109 * Compute q_i (Exponential distribution, inliers) 110 * 111 * @param f value 112 * @param lambda Lambda parameter 113 * @return probability 114 */ calcQ_i(double f, double lambda)115 protected static double calcQ_i(double f, double lambda) { 116 return lambda * FastMath.exp(-lambda * f); 117 } 118 119 /** 120 * Compute the a posterior probability for the given parameters. 121 * 122 * @param f value 123 * @param alpha Alpha (mixing) parameter 124 * @param mu Mu (for gaussian) 125 * @param sigma Sigma (for gaussian) 126 * @param lambda Lambda (for exponential) 127 * @return Probability 128 */ calcPosterior(double f, double alpha, double mu, double sigma, double lambda)129 protected static double calcPosterior(double f, double alpha, double mu, double sigma, double lambda) { 130 final double pi = calcP_i(f, mu, sigma); 131 final double qi = calcQ_i(f, lambda); 132 return (alpha * pi) / (alpha * pi + (1.0 - alpha) * qi); 133 } 134 135 @Override prepare(OutlierResult or)136 public void prepare(OutlierResult or) { 137 // Initial parameters - are these defaults sounds? 138 MeanVariance mv = new MeanVariance(); 139 DoubleRelation scores = or.getScores(); 140 for(DBIDIter id = scores.iterDBIDs(); id.valid(); id.advance()) { 141 double val = scores.doubleValue(id); 142 if(!Double.isNaN(val) && !Double.isInfinite(val)) { 143 mv.put(val); 144 } 145 } 146 double curMu = mv.getMean() * 2.; 147 if(curMu == 0) { 148 curMu = Double.MIN_NORMAL; 149 } 150 double curSigma = Math.max(mv.getSampleStddev(), Double.MIN_NORMAL); 151 double curLambda = Math.min(1.0 / curMu, Double.MAX_VALUE); 152 double curAlpha = 0.05; 153 154 DBIDs ids = scores.getDBIDs(); 155 // TODO: stop condition! 156 int iter = 0; 157 while(true) { 158 // E and M-Steps 159 // Sum of weights for both distributions 160 double otisum = 0.0, itisum = 0.0; 161 // Weighted sum for both distributions 162 double owsum = 0.0, iwsum = 0.0; 163 // Weighted deviation from previous mean (Gaussian only) 164 double osqsum = 0.0; 165 for(DBIDIter it = ids.iter(); it.valid(); it.advance()) { 166 double val = scores.doubleValue(it); 167 // E-Step: estimate outlier probability 168 double ti = calcPosterior(val, curAlpha, curMu, curSigma, curLambda); 169 // M-Step 170 otisum += ti; 171 itisum += 1 - ti; 172 owsum += ti * val; 173 iwsum += (1 - ti) * val; 174 osqsum += ti * val * val; // (val - curMu) * (val - curMu); 175 } 176 if(otisum <= 0.0 || owsum <= 0.0) { 177 LOG.warning("MixtureModel Outlier Scaling converged to extreme."); 178 break; 179 } 180 double newMu = owsum / otisum; 181 double newSigma = Math.max(FastMath.sqrt(osqsum / otisum - newMu * newMu), Double.MIN_NORMAL); 182 double newLambda = Math.min(itisum / iwsum, Double.MAX_VALUE); 183 double newAlpha = otisum / ids.size(); 184 // converged? 185 if(Math.abs(newMu - curMu) < DELTA // 186 && Math.abs(newSigma - curSigma) < DELTA // 187 && Math.abs(newLambda - curLambda) < DELTA // 188 && Math.abs(newAlpha - curAlpha) < DELTA) { 189 break; 190 } 191 if(newSigma <= 0.0 || newAlpha <= 0.0) { 192 LOG.warning("MixtureModel Outlier Scaling converged to extreme."); 193 break; 194 } 195 curMu = newMu; 196 curSigma = newSigma; 197 curLambda = newLambda; 198 curAlpha = newAlpha; 199 200 iter++; 201 if(iter > 100) { 202 LOG.warning("Max iterations met in mixture model fitting."); 203 break; 204 } 205 } 206 mu = curMu; 207 sigma = curSigma; 208 lambda = curLambda; 209 alpha = curAlpha; 210 } 211 212 @Override prepare(A array, NumberArrayAdapter<?, A> adapter)213 public <A> void prepare(A array, NumberArrayAdapter<?, A> adapter) { 214 // Initial parameters - are these defaults sounds? 215 MeanVariance mv = new MeanVariance(); 216 final int size = adapter.size(array); 217 for(int i = 0; i < size; i++) { 218 double val = adapter.getDouble(array, i); 219 if(!Double.isNaN(val) && !Double.isInfinite(val)) { 220 mv.put(val); 221 } 222 } 223 double curMu = mv.getMean() * 2.; 224 if(curMu == 0) { 225 curMu = Double.MIN_NORMAL; 226 } 227 double curSigma = Math.max(mv.getSampleStddev(), Double.MIN_NORMAL); 228 double curLambda = Math.min(1.0 / curMu, Double.MAX_VALUE); 229 double curAlpha = 0.05; 230 231 // TODO: stop condition! 232 int iter = 0; 233 while(true) { 234 // E and M-Steps 235 // Sum of weights for both distributions 236 double otisum = 0.0, itisum = 0.0; 237 // Weighted sum for both distributions 238 double owsum = 0.0, iwsum = 0.0; 239 // Weighted deviation from previous mean (Gaussian only) 240 double osqsum = 0.0; 241 for(int i = 0; i < size; i++) { 242 double val = adapter.getDouble(array, i); 243 // E-Step 244 double ti = calcPosterior(val, curAlpha, curMu, curSigma, curLambda); 245 // M-Step 246 otisum += ti; 247 itisum += 1 - ti; 248 owsum += ti * val; 249 iwsum += (1 - ti) * val; 250 osqsum += ti * val * val; // (val - curMu) * (val - curMu); 251 } 252 if(otisum <= 0.0 || owsum <= 0.0) { 253 LOG.warning("MixtureModel Outlier Scaling converged to extreme."); 254 break; 255 } 256 double newMu = owsum / otisum; 257 double newSigma = Math.max(FastMath.sqrt(osqsum / otisum - newMu * newMu), Double.MIN_NORMAL); 258 double newLambda = Math.min(itisum / iwsum, Double.MAX_VALUE); 259 double newAlpha = otisum / size; 260 // converged? 261 if(Math.abs(newMu - curMu) < DELTA // 262 && Math.abs(newSigma - curSigma) < DELTA // 263 && Math.abs(newLambda - curLambda) < DELTA // 264 && Math.abs(newAlpha - curAlpha) < DELTA) { 265 break; 266 } 267 if(newSigma <= 0.0 || newAlpha <= 0.0) { 268 LOG.warning("MixtureModel Outlier Scaling converged to extreme."); 269 break; 270 } 271 curMu = newMu; 272 curSigma = newSigma; 273 curLambda = newLambda; 274 curAlpha = newAlpha; 275 276 iter++; 277 if(iter > 100) { 278 LOG.warning("Max iterations met in mixture model fitting."); 279 break; 280 } 281 } 282 mu = curMu; 283 sigma = curSigma; 284 lambda = curLambda; 285 alpha = curAlpha; 286 } 287 288 @Override getMax()289 public double getMax() { 290 return 1.0; 291 } 292 293 @Override getMin()294 public double getMin() { 295 return 0.0; 296 } 297 298 @Override getScaled(double value)299 public double getScaled(double value) { 300 final double val = calcPosterior(value, alpha, mu, sigma, lambda); 301 // Work around issues with unstable convergence. 302 return val > 0 ? val : 0; 303 } 304 } 305