1 /*
2  * This file is part of ELKI:
3  * Environment for Developing KDD-Applications Supported by Index-Structures
4  *
5  * Copyright (C) 2018
6  * ELKI Development Team
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Affero General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Affero General Public License for more details.
17  *
18  * You should have received a copy of the GNU Affero General Public License
19  * along with this program. If not, see <http://www.gnu.org/licenses/>.
20  */
21 package de.lmu.ifi.dbs.elki.distance.distancefunction.probabilistic;
22 
23 import de.lmu.ifi.dbs.elki.data.NumberVector;
24 import de.lmu.ifi.dbs.elki.data.spatial.SpatialComparable;
25 import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
26 import de.lmu.ifi.dbs.elki.database.query.distance.SpatialPrimitiveDistanceSimilarityQuery;
27 import de.lmu.ifi.dbs.elki.database.relation.Relation;
28 import de.lmu.ifi.dbs.elki.distance.distancefunction.AbstractNumberVectorDistanceFunction;
29 import de.lmu.ifi.dbs.elki.distance.distancefunction.SpatialPrimitiveDistanceFunction;
30 import de.lmu.ifi.dbs.elki.distance.similarityfunction.NormalizedPrimitiveSimilarityFunction;
31 import de.lmu.ifi.dbs.elki.math.MathUtil;
32 import de.lmu.ifi.dbs.elki.utilities.Alias;
33 import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
34 import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
35 import net.jafama.FastMath;
36 
37 /**
38  * Hellinger metric / affinity / kernel, Bhattacharyya coefficient, fidelity
39  * similarity, Matusita distance, Hellinger-Kakutani metric on a probability
40  * distribution.
41  * <p>
42  * We assume vectors represent normalized probability distributions. Then
43  * \[\text{Hellinger}(\vec{x},\vec{y}):=
44  * \sqrt{\tfrac12\sum\nolimits_i \left(\sqrt{x_i}-\sqrt{y_i}\right)^2 } \]
45  * <p>
46  * The corresponding kernel / similarity is
47  * \[ K_{\text{Hellinger}}(\vec{x},\vec{y}) := \sum\nolimits_i \sqrt{x_i y_i} \]
48  * <p>
49  * If we have normalized probability distributions, we have the nice
50  * property that
51  * \( K_{\text{Hellinger}}(\vec{x},\vec{x}) = \sum\nolimits_i x_i = 1\).
52  * and therefore \( K_{\text{Hellinger}}(\vec{x},\vec{y}) \in [0:1] \).
53  * <p>
54  * Furthermore, we have the following relationship between this variant of the
55  * distance and this kernel:
56  * \[ \text{Hellinger}^2(\vec{x},\vec{y})
57  * = \tfrac12\sum\nolimits_i \left(\sqrt{x_i}-\sqrt{y_i}\right)^2
58  * = \tfrac12\sum\nolimits_i x_i + y_i - 2 \sqrt{x_i y_i} \]
59  * \[ \text{Hellinger}^2(\vec{x},\vec{y})
60  * = \tfrac12K_{\text{Hellinger}}(\vec{x},\vec{x})
61  * + \tfrac12K_{\text{Hellinger}}(\vec{y},\vec{y})
62  * - K_{\text{Hellinger}}(\vec{x},\vec{y})
63  * = 1 - K_{\text{Hellinger}}(\vec{x},\vec{y}) \]
64  * which implies \(\text{Hellinger}(\vec{x},\vec{y}) \in [0;1]\),
65  * and is very similar to the Euclidean distance and the linear kernel.
66  * <p>
67  * From this, it follows trivially that Hellinger distance corresponds
68  * to the kernel transformation
69  * \(\phi:\vec{x}\mapsto(\tfrac12\sqrt{x_1},\ldots,\tfrac12\sqrt{x_d})\).
70  * <p>
71  * Deza and Deza unfortunately also give a second definition, as:
72  * \[\text{Hellinger-Deza}(\vec{x},\vec{y}):=\sqrt{2\sum\nolimits_i
73  * \left(\sqrt{\tfrac{x_i}{\bar{x}}}-\sqrt{\tfrac{y_i}{\bar{y}}}\right)^2}\]
74  * which has a built-in normalization, and a different scaling that is no longer
75  * bound to $[0;1]$. The 2 in this definition likely should be a \(\frac12\).
76  * <p>
77  * This distance is well suited for histograms, but it is then more efficient to
78  * once normalize the histograms, apply the square roots, and then use Euclidean
79  * distance (i.e., use the "kernel trick" in reverse, materializing the
80  * transformation \(\phi\) given above).
81  * <p>
82  * Reference:
83  * <p>
84  * E. Hellinger<br>
85  * Neue Begründung der Theorie quadratischer Formen von unendlichvielen
86  * Veränderlichen<br>
87  * Journal für die reine und angewandte Mathematik
88  * <p>
89  * M.-M. Deza, E. Deza<br>
90  * Dictionary of distances
91  * <p>
92  * TODO: support acceleration for sparse vectors.
93  * <p>
94  * TODO: add a second variant, with built-in normalization.
95  *
96  * @author Erich Schubert
97  * @since 0.7.0
98  */
99 @Reference(authors = "E. Hellinger", //
100     title = "Neue Begründung der Theorie quadratischer Formen von unendlichvielen Veränderlichen", //
101     booktitle = "Journal für die reine und angewandte Mathematik", //
102     url = "http://resolver.sub.uni-goettingen.de/purl?GDZPPN002166941", //
103     bibkey = "journals/mathematik/Hellinger1909")
104 @Reference(authors = "M.-M. Deza, E. Deza", //
105     title = "Dictionary of distances", //
106     booktitle = "Dictionary of distances", //
107     url = "https://doi.org/10.1007/978-3-642-00234-2", //
108     bibkey = "doi:10.1007/978-3-642-00234-2")
109 @Alias({ "hellinger", "bhattacharyya" })
110 public class HellingerDistanceFunction extends AbstractNumberVectorDistanceFunction implements SpatialPrimitiveDistanceFunction<NumberVector>, NormalizedPrimitiveSimilarityFunction<NumberVector> {
111   /**
112    * Static instance.
113    */
114   public static final HellingerDistanceFunction STATIC = new HellingerDistanceFunction();
115 
116   /**
117    * Assertion error message.
118    */
119   private static final String NON_NEGATIVE = "Hellinger distance requires non-negative values.";
120 
121   /**
122    * Hellinger kernel. Use static instance {@link #STATIC}!
123    */
124   @Deprecated
HellingerDistanceFunction()125   public HellingerDistanceFunction() {
126     super();
127   }
128 
129   @Override
distance(final NumberVector fv1, final NumberVector fv2)130   public double distance(final NumberVector fv1, final NumberVector fv2) {
131     final int dim1 = fv1.getDimensionality(), dim2 = fv2.getDimensionality();
132     final int mindim = dim1 < dim2 ? dim1 : dim2;
133     double agg = 0.;
134     for(int d = 0; d < mindim; d++) {
135       final double v1 = fv1.doubleValue(d), v2 = fv2.doubleValue(d);
136       assert (v1 >= 0 && v2 >= 0) : NON_NEGATIVE;
137       if(v1 != v2) {
138         final double v = FastMath.sqrt(v1) - FastMath.sqrt(v2);
139         agg += v * v;
140       }
141     }
142     for(int d = mindim; d < dim1; d++) {
143       final double v1 = fv1.doubleValue(d);
144       assert (v1 >= 0) : NON_NEGATIVE;
145       agg += v1;
146     }
147     for(int d = mindim; d < dim2; d++) {
148       final double v2 = fv2.doubleValue(d);
149       assert (v2 >= 0) : NON_NEGATIVE;
150       agg += v2;
151     }
152     return MathUtil.SQRTHALF * FastMath.sqrt(agg);
153   }
154 
155   @Override
minDist(SpatialComparable mbr1, SpatialComparable mbr2)156   public double minDist(SpatialComparable mbr1, SpatialComparable mbr2) {
157     final int dim1 = mbr1.getDimensionality(), dim2 = mbr2.getDimensionality();
158     final int mindim = (dim1 < dim2) ? dim1 : dim2;
159     double agg = 0.;
160     for(int d = 0; d < mindim; d++) {
161       final double min1 = mbr1.getMin(d), max1 = mbr1.getMax(d);
162       final double min2 = mbr2.getMin(d), max2 = mbr2.getMax(d);
163       assert (min1 >= 0 && min2 >= 0) : NON_NEGATIVE;
164       if(max1 < min2) {
165         final double v = FastMath.sqrt(max1) - FastMath.sqrt(min2);
166         agg += v * v;
167       }
168       else if(max2 < min1) {
169         final double v = FastMath.sqrt(max2) - FastMath.sqrt(min1);
170         agg += v * v;
171       }
172     }
173     for(int d = mindim; d < dim1; d++) {
174       final double min1 = mbr1.getMin(d);
175       assert (min1 >= 0) : NON_NEGATIVE;
176       agg += min1;
177     }
178     for(int d = mindim; d < dim2; d++) {
179       final double min2 = mbr2.getMin(d);
180       assert (min2 >= 0) : NON_NEGATIVE;
181       agg += min2;
182     }
183     return MathUtil.SQRTHALF * FastMath.sqrt(agg);
184   }
185 
186   @Override
similarity(final NumberVector o1, final NumberVector o2)187   public double similarity(final NumberVector o1, final NumberVector o2) {
188     // TODO: accelerate sparse!
189     final int dim1 = o1.getDimensionality(), dim2 = o2.getDimensionality();
190     final int mindim = dim1 < dim2 ? dim1 : dim2;
191     double agg = 0.;
192     for(int d = 0; d < mindim; d++) {
193       final double v1 = o1.doubleValue(d), v2 = o2.doubleValue(d);
194       agg += v1 == v2 ? (v1 > 0 ? v1 : -v1) : (v1 == 0 || v2 == 0) ? 0. : FastMath.sqrt(v1 * v2);
195     }
196     return agg;
197   }
198 
199   @Override
200   public boolean isSymmetric() {
201     return true;
202   }
203 
204   @Override
205   public boolean isMetric() {
206     return true; // as this equals Euclidean in sqrt space
207   }
208 
209   @Override
210   public <T extends NumberVector> SpatialPrimitiveDistanceSimilarityQuery<T> instantiate(Relation<T> database) {
211     return new SpatialPrimitiveDistanceSimilarityQuery<>(database, this, this);
212   }
213 
214   @Override
215   public SimpleTypeInformation<? super NumberVector> getInputTypeRestriction() {
216     return NumberVector.VARIABLE_LENGTH;
217   }
218 
219   @Override
220   public boolean equals(Object obj) {
221     return obj == this || (obj != null && this.getClass().equals(obj.getClass()));
222   }
223 
224   @Override
225   public int hashCode() {
226     return getClass().hashCode();
227   }
228 
229   /**
230    * Parameterization class.
231    *
232    * @author Erich Schubert
233    */
234   public static class Parameterizer extends AbstractParameterizer {
235     @Override
236     protected HellingerDistanceFunction makeInstance() {
237       return HellingerDistanceFunction.STATIC;
238     }
239   }
240 }
241