1 /*
2  * This file is part of ELKI:
3  * Environment for Developing KDD-Applications Supported by Index-Structures
4  *
5  * Copyright (C) 2018
6  * ELKI Development Team
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Affero General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Affero General Public License for more details.
17  *
18  * You should have received a copy of the GNU Affero General Public License
19  * along with this program. If not, see <http://www.gnu.org/licenses/>.
20  */
21 package de.lmu.ifi.dbs.elki.data;
22 
23 import java.io.IOException;
24 import java.nio.ByteBuffer;
25 import java.util.Arrays;
26 
27 import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayAdapter;
28 import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
29 import de.lmu.ifi.dbs.elki.utilities.io.ByteArrayUtil;
30 import de.lmu.ifi.dbs.elki.utilities.io.ByteBufferSerializer;
31 import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
32 
33 import it.unimi.dsi.fastutil.ints.Int2DoubleMap;
34 import it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap;
35 import it.unimi.dsi.fastutil.objects.ObjectIterator;
36 
37 /**
38  * Sparse vector type, using {@code int[]} for storing the values, and
39  * {@code int[]} for storing the indexes, approximately 8 bytes per non-zero
40  * integer value.
41  *
42  * @author Arthur Zimek
43  * @since 0.6.0
44  */
45 public class SparseIntegerVector implements SparseNumberVector {
46   /**
47    * Static instance.
48    */
49   public static final SparseIntegerVector.Factory FACTORY = new SparseIntegerVector.Factory();
50 
51   /**
52    * Serializer using varint encoding.
53    */
54   public static final ByteBufferSerializer<SparseIntegerVector> VARIABLE_SERIALIZER = new VariableSerializer();
55 
56   /**
57    * Constant value, for use in (inefficient) {@link #getValue} API.
58    */
59   private static final int INT0 = Integer.valueOf(0);
60 
61   /**
62    * Indexes of values.
63    */
64   private final int[] indexes;
65 
66   /**
67    * Stored values.
68    */
69   private final int[] values;
70 
71   /**
72    * The dimensionality of this feature vector.
73    */
74   private int dimensionality;
75 
76   /**
77    * Direct constructor.
78    *
79    * @param indexes Indexes Must be sorted!
80    * @param values Associated value.
81    * @param dimensionality "true" dimensionality
82    */
SparseIntegerVector(int[] indexes, int[] values, int dimensionality)83   public SparseIntegerVector(int[] indexes, int[] values, int dimensionality) {
84     super();
85     this.indexes = indexes;
86     this.values = values;
87     this.dimensionality = dimensionality;
88   }
89 
90   /**
91    * Create a SparseIntegerVector consisting of double values according to the
92    * specified mapping of indices and values.
93    *
94    * @param values the values to be set as values of the real vector
95    * @param dimensionality the dimensionality of this feature vector
96    * @throws IllegalArgumentException if the given dimensionality is too small
97    *         to cover the given values (i.e., the maximum index of any value not
98    *         zero is bigger than the given dimensionality)
99    */
SparseIntegerVector(Int2DoubleOpenHashMap values, int dimensionality)100   public SparseIntegerVector(Int2DoubleOpenHashMap values, int dimensionality) throws IllegalArgumentException {
101     if(values.size() > dimensionality) {
102       throw new IllegalArgumentException("values.size() > dimensionality!");
103     }
104 
105     this.indexes = new int[values.size()];
106     this.values = new int[values.size()];
107     // Import and sort the indexes
108     {
109       ObjectIterator<Int2DoubleMap.Entry> iter = values.int2DoubleEntrySet().fastIterator();
110       for(int i = 0; iter.hasNext(); i++) {
111         this.indexes[i] = iter.next().getIntKey();
112       }
113       Arrays.sort(this.indexes);
114     }
115     // Import the values accordingly
116     {
117       for(int i = 0; i < values.size(); i++) {
118         this.values[i] = (int) values.get(this.indexes[i]);
119       }
120     }
121     this.dimensionality = dimensionality;
122     final int maxdim = getMaxDim();
123     if(maxdim > dimensionality) {
124       throw new IllegalArgumentException("Given dimensionality " + dimensionality + " is too small w.r.t. the given values (occurring maximum: " + maxdim + ").");
125     }
126   }
127 
128   /**
129    * Get the maximum dimensionality.
130    *
131    * @return the maximum dimensionality seen
132    */
getMaxDim()133   private int getMaxDim() {
134     return (this.indexes.length == 0) ? 0 : this.indexes[this.indexes.length - 1];
135   }
136 
137   /**
138    * Create a SparseIntegerVector consisting of double values according to the
139    * specified mapping of indices and values.
140    *
141    * @param values the values to be set as values of the real vector
142    * @throws IllegalArgumentException if the given dimensionality is too small
143    *         to cover the given values (i.e., the maximum index of any value not
144    *         zero is bigger than the given dimensionality)
145    */
SparseIntegerVector(int[] values)146   public SparseIntegerVector(int[] values) throws IllegalArgumentException {
147     this.dimensionality = values.length;
148 
149     // Count the number of non-zero entries
150     int size = 0;
151     {
152       for(int i = 0; i < values.length; i++) {
153         if(values[i] != 0) {
154           size++;
155         }
156       }
157     }
158     this.indexes = new int[size];
159     this.values = new int[size];
160 
161     // Copy the values
162     {
163       int pos = 0;
164       for(int i = 0; i < values.length; i++) {
165         int value = values[i];
166         if(value != 0) {
167           this.indexes[pos] = i;
168           this.values[pos] = value;
169           pos++;
170         }
171       }
172     }
173   }
174 
175   @Override
getDimensionality()176   public int getDimensionality() {
177     return dimensionality;
178   }
179 
180   /**
181    * Sets the dimensionality to the new value.
182    *
183    *
184    * @param dimensionality the new dimensionality
185    * @throws IllegalArgumentException if the given dimensionality is too small
186    *         to cover the given values (i.e., the maximum index of any value not
187    *         zero is bigger than the given dimensionality)
188    */
189   @Override
setDimensionality(int dimensionality)190   public void setDimensionality(int dimensionality) throws IllegalArgumentException {
191     final int maxdim = getMaxDim();
192     if(maxdim > dimensionality) {
193       throw new IllegalArgumentException("Given dimensionality " + dimensionality + " is too small w.r.t. the given values (occurring maximum: " + maxdim + ").");
194     }
195     this.dimensionality = dimensionality;
196   }
197 
198   @Override
199   @Deprecated
getValue(int dimension)200   public Integer getValue(int dimension) {
201     int pos = Arrays.binarySearch(this.indexes, dimension);
202     return (pos >= 0) ? values[pos] : INT0;
203   }
204 
205   @Override
206   @Deprecated
doubleValue(int dimension)207   public double doubleValue(int dimension) {
208     int pos = Arrays.binarySearch(this.indexes, dimension);
209     return (pos >= 0) ? values[pos] : 0.;
210   }
211 
212   @Override
213   @Deprecated
longValue(int dimension)214   public long longValue(int dimension) {
215     int pos = Arrays.binarySearch(this.indexes, dimension);
216     return (pos >= 0) ? values[pos] : 0L;
217   }
218 
219   @Override
220   @Deprecated
intValue(int dimension)221   public int intValue(int dimension) {
222     int pos = Arrays.binarySearch(this.indexes, dimension);
223     return (pos >= 0) ? values[pos] : 0;
224   }
225 
226   @Override
toArray()227   public double[] toArray() {
228     double[] vals = new double[dimensionality];
229     for(int i = 0; i < indexes.length; i++) {
230       vals[this.indexes[i]] = this.values[i];
231     }
232     return vals;
233   }
234 
235   /**
236    * Create a String representation of this SparseIntegerVector as suitable for
237    * {@link de.lmu.ifi.dbs.elki.datasource.parser.SparseNumberVectorLabelParser}
238    * .
239    *
240    * The returned String is a single line with entries separated by
241    * {@link NumberVector#ATTRIBUTE_SEPARATOR}. The first entry gives the
242    * number of values actually not zero. Following entries are pairs of Integer
243    * and Integer where the Integer gives the index of the dimensionality and the
244    * Integer gives the corresponding value.
245    *
246    * Example: a vector (0,1.2,1.3,0)<sup>T</sup> would result in the String<br>
247    * <code>2 2 1.2 3 1.3</code><br>
248    *
249    * @return a String representation of this SparseIntegerVector
250    */
251   @Override
toString()252   public String toString() {
253     StringBuilder featureLine = new StringBuilder(14 * this.indexes.length)//
254         .append(this.indexes.length);
255     for(int i = 0; i < this.indexes.length; i++) {
256       featureLine.append(ATTRIBUTE_SEPARATOR).append(this.indexes[i])//
257           .append(ATTRIBUTE_SEPARATOR).append(this.values[i]);
258     }
259     return featureLine.toString();
260   }
261 
262   @Override
iterDim(int iter)263   public int iterDim(int iter) {
264     return indexes[iter];
265   }
266 
267   @Override
iterValid(int iter)268   public boolean iterValid(int iter) {
269     return iter < indexes.length;
270   }
271 
272   @Override
iterDoubleValue(int iter)273   public double iterDoubleValue(int iter) {
274     return (double) values[iter];
275   }
276 
277   @Override
iterIntValue(int iter)278   public int iterIntValue(int iter) {
279     return values[iter];
280   }
281 
282   @Override
iterLongValue(int iter)283   public long iterLongValue(int iter) {
284     return (long) values[iter];
285   }
286 
287   /**
288    * Factory class.
289    *
290    * @author Erich Schubert
291    *
292    * @has - - - SparseIntegerVector
293    */
294   public static class Factory implements SparseNumberVector.Factory<SparseIntegerVector> {
295     @Override
newFeatureVector(A array, ArrayAdapter<? extends Number, A> adapter)296     public <A> SparseIntegerVector newFeatureVector(A array, ArrayAdapter<? extends Number, A> adapter) {
297       int dim = adapter.size(array);
298       int[] values = new int[dim];
299       for(int i = 0; i < dim; i++) {
300         values[i] = adapter.get(array, i).intValue();
301       }
302       // TODO: improve efficiency
303       return new SparseIntegerVector(values);
304     }
305 
306     @Override
newNumberVector(A array, NumberArrayAdapter<?, ? super A> adapter)307     public <A> SparseIntegerVector newNumberVector(A array, NumberArrayAdapter<?, ? super A> adapter) {
308       int dim = adapter.size(array);
309       int[] values = new int[dim];
310       for(int i = 0; i < dim; i++) {
311         values[i] = adapter.getInteger(array, i);
312       }
313       // TODO: improve efficiency
314       return new SparseIntegerVector(values);
315     }
316 
317     @Override
newNumberVector(Int2DoubleOpenHashMap values, int maxdim)318     public SparseIntegerVector newNumberVector(Int2DoubleOpenHashMap values, int maxdim) {
319       return new SparseIntegerVector(values, maxdim);
320     }
321 
322     @Override
getDefaultSerializer()323     public ByteBufferSerializer<SparseIntegerVector> getDefaultSerializer() {
324       return VARIABLE_SERIALIZER;
325     }
326 
327     @Override
getRestrictionClass()328     public Class<? super SparseIntegerVector> getRestrictionClass() {
329       return SparseIntegerVector.class;
330     }
331 
332     /**
333      * Parameterization class.
334      *
335      * @author Erich Schubert
336      */
337     public static class Parameterizer extends AbstractParameterizer {
338       @Override
makeInstance()339       protected SparseIntegerVector.Factory makeInstance() {
340         return FACTORY;
341       }
342     }
343   }
344 
345   /**
346    * Serialization class using VarInt encodings.
347    *
348    * @author Erich Schubert
349    *
350    * @assoc - serializes - SparseIntegerVector
351    */
352   public static class VariableSerializer implements ByteBufferSerializer<SparseIntegerVector> {
353     @Override
fromByteBuffer(ByteBuffer buffer)354     public SparseIntegerVector fromByteBuffer(ByteBuffer buffer) throws IOException {
355       final int dimensionality = ByteArrayUtil.readUnsignedVarint(buffer);
356       final int nonzero = ByteArrayUtil.readUnsignedVarint(buffer);
357       final int[] dims = new int[nonzero];
358       final int[] values = new int[nonzero];
359       for(int i = 0; i < nonzero; i++) {
360         dims[i] = ByteArrayUtil.readUnsignedVarint(buffer);
361         values[i] = ByteArrayUtil.readSignedVarint(buffer);
362       }
363       return new SparseIntegerVector(dims, values, dimensionality);
364     }
365 
366     @Override
toByteBuffer(ByteBuffer buffer, SparseIntegerVector vec)367     public void toByteBuffer(ByteBuffer buffer, SparseIntegerVector vec) throws IOException {
368       ByteArrayUtil.writeUnsignedVarint(buffer, vec.dimensionality);
369       ByteArrayUtil.writeUnsignedVarint(buffer, vec.values.length);
370       for(int i = 0; i < vec.values.length; i++) {
371         ByteArrayUtil.writeUnsignedVarint(buffer, vec.indexes[i]);
372         ByteArrayUtil.writeSignedVarint(buffer, vec.values[i]);
373       }
374     }
375 
376     @Override
getByteSize(SparseIntegerVector vec)377     public int getByteSize(SparseIntegerVector vec) {
378       int sum = 0;
379       sum += ByteArrayUtil.getUnsignedVarintSize(vec.dimensionality);
380       sum += ByteArrayUtil.getUnsignedVarintSize(vec.values.length);
381       for(int i = 0; i < vec.values.length; i++) {
382         sum += ByteArrayUtil.getUnsignedVarintSize(vec.indexes[i]);
383         sum += ByteArrayUtil.getSignedVarintSize(vec.values[i]);
384       }
385       return sum;
386     }
387   }
388 }
389