1 /*
2  * This file is part of ELKI:
3  * Environment for Developing KDD-Applications Supported by Index-Structures
4  *
5  * Copyright (C) 2018
6  * ELKI Development Team
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Affero General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Affero General Public License for more details.
17  *
18  * You should have received a copy of the GNU Affero General Public License
19  * along with this program. If not, see <http://www.gnu.org/licenses/>.
20  */
21 package de.lmu.ifi.dbs.elki.distance.distancefunction.set;
22 
23 import de.lmu.ifi.dbs.elki.data.BitVector;
24 import de.lmu.ifi.dbs.elki.data.FeatureVector;
25 import de.lmu.ifi.dbs.elki.data.NumberVector;
26 import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
27 import de.lmu.ifi.dbs.elki.database.query.DistanceSimilarityQuery;
28 import de.lmu.ifi.dbs.elki.database.query.distance.PrimitiveDistanceSimilarityQuery;
29 import de.lmu.ifi.dbs.elki.database.relation.Relation;
30 import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction;
31 import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
32 import de.lmu.ifi.dbs.elki.distance.similarityfunction.NormalizedPrimitiveSimilarityFunction;
33 import de.lmu.ifi.dbs.elki.utilities.Alias;
34 import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
35 
36 /**
37  * A flexible extension of Jaccard similarity to non-binary vectors.
38  * <p>
39  * Jaccard coefficient is commonly defined as \(\frac{A\cap B}{A\cup B}\).
40  * <p>
41  * We can extend this definition to non-binary vectors as follows:
42  * \(\tfrac{|\{i\mid a_i = b_i\}|}{|\{i\mid a_i = 0 \wedge b_i = 0\}|}\)
43  * <p>
44  * For binary vectors, this will obviously be the same quantity. However, this
45  * version is more useful for categorical data.
46  * <p>
47  * Reference:
48  * <p>
49  * P. Jaccard<br>
50  * Distribution de la florine alpine dans la Bassin de Dranses et dans quelques
51  * regiones voisines<br>
52  * Bulletin del la Société Vaudoise des Sciences Naturelles
53  *
54  * @author Erich Schubert
55  * @since 0.6.0
56  */
57 @Reference(authors = "P. Jaccard", //
58     title = "Distribution de la florine alpine dans la Bassin de Dranses et dans quelques regiones voisines", //
59     booktitle = "Bulletin del la Société Vaudoise des Sciences Naturelles", //
60     url = "http://data.rero.ch/01-R241574160", //
61     bibkey = "journals/misc/Jaccard1902")
62 @Alias("de.lmu.ifi.dbs.elki.distance.similarityfunction.JaccardPrimitiveSimilarityFunction")
63 public class JaccardSimilarityDistanceFunction extends AbstractSetDistanceFunction<FeatureVector<?>> implements NormalizedPrimitiveSimilarityFunction<FeatureVector<?>>, NumberVectorDistanceFunction<FeatureVector<?>>, PrimitiveDistanceFunction<FeatureVector<?>> {
64   /**
65    * Constructor. No parameters.
66    */
JaccardSimilarityDistanceFunction()67   public JaccardSimilarityDistanceFunction() {
68     super();
69   }
70 
71   @Override
similarity(FeatureVector<?> o1, FeatureVector<?> o2)72   public double similarity(FeatureVector<?> o1, FeatureVector<?> o2) {
73     if(o1 instanceof BitVector && o2 instanceof BitVector) {
74       return ((BitVector) o1).jaccardSimilarity((BitVector) o2);
75     }
76     if(o1 instanceof NumberVector && o2 instanceof NumberVector) {
77       return similarityNumberVector((NumberVector) o1, (NumberVector) o2);
78     }
79     final int d1 = o1.getDimensionality(), d2 = o2.getDimensionality();
80     int intersection = 0, union = 0;
81     int d = 0;
82     for(; d < d1 && d < d2; d++) {
83       Object v1 = o1.getValue(d), v2 = o2.getValue(d);
84       final boolean n1 = isNull(v1), n2 = isNull(v2);
85       if(v1 instanceof Double && Double.isNaN((Double) v1) //
86           || v2 instanceof Double && Double.isNaN((Double) v2)) {
87         continue;
88       }
89       if(!n1 || !n2) {
90         ++union;
91         if(!n1 && v1.equals(v2)) {
92           ++intersection;
93         }
94       }
95     }
96     for(; d < d1; d++) {
97       Object v1 = o1.getValue(d);
98       if(!isNull(v1)) {
99         if(v1 instanceof Double && Double.isNaN((Double) v1)) {
100           continue;
101         }
102         ++union;
103       }
104     }
105     for(; d < d2; d++) {
106       Object v2 = o2.getValue(d);
107       if(!isNull(v2)) {
108         if(v2 instanceof Double && Double.isNaN((Double) v2)) {
109           continue;
110         }
111         ++union;
112       }
113     }
114     return intersection / (double) union;
115   }
116 
117   /**
118    * Compute Jaccard similarity for two number vectors.
119    *
120    * @param o1 First vector
121    * @param o2 Second vector
122    * @return Jaccard similarity
123    */
similarityNumberVector(NumberVector o1, NumberVector o2)124   public static double similarityNumberVector(NumberVector o1, NumberVector o2) {
125     final int d1 = o1.getDimensionality(), d2 = o2.getDimensionality();
126     int intersection = 0, union = 0;
127     int d = 0;
128     for(; d < d1 && d < d2; d++) {
129       double v1 = o1.doubleValue(d), v2 = o2.doubleValue(d);
130       if(v1 != v1 || v2 != v2) { // Skip NaNs.
131         continue;
132       }
133       if(v1 != 0. || v2 != 0) {
134         ++union;
135         if(v1 == v2) {
136           ++intersection;
137         }
138       }
139     }
140     for(; d < d1; d++) {
141       if(o1.doubleValue(d) != 0) {
142         ++union;
143       }
144     }
145     for(; d < d2; d++) {
146       if(o2.doubleValue(d) != 0) {
147         ++union;
148       }
149     }
150     return intersection / (double) union;
151   }
152 
153   @Override
distance(FeatureVector<?> o1, FeatureVector<?> o2)154   public double distance(FeatureVector<?> o1, FeatureVector<?> o2) {
155     return 1. - similarity(o1, o2);
156   }
157 
158   @Override
distance(NumberVector o1, NumberVector o2)159   public double distance(NumberVector o1, NumberVector o2) {
160     if(o1 instanceof BitVector && o2 instanceof BitVector) {
161       return 1. - ((BitVector) o1).jaccardSimilarity((BitVector) o2);
162     }
163     return 1. - similarityNumberVector(o1, o2);
164   }
165 
166   @Override
isSymmetric()167   public boolean isSymmetric() {
168     return true;
169   }
170 
171   @Override
isMetric()172   public boolean isMetric() {
173     return true;
174   }
175 
176   @Override
getInputTypeRestriction()177   public SimpleTypeInformation<? super FeatureVector<?>> getInputTypeRestriction() {
178     return FeatureVector.TYPE;
179   }
180 
181   @Override
instantiate(Relation<T> relation)182   public <T extends FeatureVector<?>> DistanceSimilarityQuery<T> instantiate(Relation<T> relation) {
183     return new PrimitiveDistanceSimilarityQuery<>(relation, this, this);
184   }
185 
186   @Override
equals(Object obj)187   public boolean equals(Object obj) {
188     return obj == this || (obj != null && this.getClass().equals(obj.getClass()));
189   }
190 
191   @Override
hashCode()192   public int hashCode() {
193     return getClass().hashCode();
194   }
195 }
196