1 /*
2  * This file is part of ELKI:
3  * Environment for Developing KDD-Applications Supported by Index-Structures
4  *
5  * Copyright (C) 2018
6  * ELKI Development Team
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Affero General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Affero General Public License for more details.
17  *
18  * You should have received a copy of the GNU Affero General Public License
19  * along with this program. If not, see <http://www.gnu.org/licenses/>.
20  */
21 package de.lmu.ifi.dbs.elki.data;
22 
23 import java.io.IOException;
24 import java.nio.ByteBuffer;
25 import java.util.Arrays;
26 
27 import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayAdapter;
28 import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
29 import de.lmu.ifi.dbs.elki.utilities.io.ByteArrayUtil;
30 import de.lmu.ifi.dbs.elki.utilities.io.ByteBufferSerializer;
31 import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
32 
33 import it.unimi.dsi.fastutil.ints.Int2DoubleMap;
34 import it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap;
35 import it.unimi.dsi.fastutil.ints.Int2FloatMap;
36 import it.unimi.dsi.fastutil.ints.Int2FloatOpenHashMap;
37 import it.unimi.dsi.fastutil.objects.ObjectIterator;
38 
39 /**
40  * Sparse vector type, using {@code float[]} for storing the values, and
41  * {@code int[]} for storing the indexes, approximately 8 bytes per non-zero
42  * value.
43  *
44  * @author Arthur Zimek
45  * @since 0.2
46  */
47 public class SparseFloatVector implements SparseNumberVector {
48   /**
49    * Static instance.
50    */
51   public static final SparseFloatVector.Factory FACTORY = new SparseFloatVector.Factory();
52 
53   /**
54    * Serializer using varint encoding.
55    */
56   public static final ByteBufferSerializer<SparseFloatVector> VARIABLE_SERIALIZER = new VariableSerializer();
57 
58   /**
59    * Float constant, for missing values in (inefficient) {@link #getValue} API.
60    */
61   private static final float FLOAT0 = 0.f;
62 
63   /**
64    * Indexes of values.
65    */
66   private final int[] indexes;
67 
68   /**
69    * Stored values.
70    */
71   private final float[] values;
72 
73   /**
74    * The dimensionality of this feature vector.
75    */
76   private int dimensionality;
77 
78   /**
79    * Direct constructor.
80    *
81    * @param indexes Indexes Must be sorted!
82    * @param values Associated value.
83    * @param dimensionality "true" dimensionality
84    */
SparseFloatVector(int[] indexes, float[] values, int dimensionality)85   public SparseFloatVector(int[] indexes, float[] values, int dimensionality) {
86     super();
87     this.indexes = indexes;
88     this.values = values;
89     this.dimensionality = dimensionality;
90   }
91 
92   /**
93    * Create a SparseFloatVector consisting of double values according to the
94    * specified mapping of indices and values.
95    *
96    * @param values the values to be set as values of the real vector
97    * @param dimensionality the dimensionality of this feature vector
98    * @throws IllegalArgumentException if the given dimensionality is too small
99    *         to cover the given values (i.e., the maximum index of any value not
100    *         zero is bigger than the given dimensionality)
101    */
SparseFloatVector(Int2FloatOpenHashMap values, int dimensionality)102   public SparseFloatVector(Int2FloatOpenHashMap values, int dimensionality) throws IllegalArgumentException {
103     if(values.size() > dimensionality) {
104       throw new IllegalArgumentException("values.size() > dimensionality!");
105     }
106 
107     this.indexes = new int[values.size()];
108     this.values = new float[values.size()];
109     // Import and sort the indexes
110     {
111       ObjectIterator<Int2FloatMap.Entry> iter = values.int2FloatEntrySet().fastIterator();
112       for(int i = 0; iter.hasNext(); i++) {
113         this.indexes[i] = iter.next().getIntKey();
114       }
115       Arrays.sort(this.indexes);
116     }
117     // Import the values accordingly
118     {
119       for(int i = 0; i < values.size(); i++) {
120         this.values[i] = values.get(this.indexes[i]);
121       }
122     }
123     this.dimensionality = dimensionality;
124     final int maxdim = getMaxDim();
125     if(maxdim > dimensionality) {
126       throw new IllegalArgumentException("Given dimensionality " + dimensionality + " is too small w.r.t. the given values (occurring maximum: " + maxdim + ").");
127     }
128   }
129 
130   /**
131    * Get the maximum dimensionality.
132    *
133    * @return the maximum dimensionality seen
134    */
getMaxDim()135   private int getMaxDim() {
136     return (this.indexes.length == 0) ? 0 : this.indexes[this.indexes.length - 1];
137   }
138 
139   /**
140    * Create a SparseFloatVector consisting of double values according to the
141    * specified mapping of indices and values.
142    *
143    * @param values the values to be set as values of the real vector
144    * @throws IllegalArgumentException if the given dimensionality is too small
145    *         to cover the given values (i.e., the maximum index of any value not
146    *         zero is bigger than the given dimensionality)
147    */
SparseFloatVector(float[] values)148   public SparseFloatVector(float[] values) throws IllegalArgumentException {
149     this.dimensionality = values.length;
150 
151     // Count the number of non-zero entries
152     int size = 0;
153     {
154       for(int i = 0; i < values.length; i++) {
155         if(values[i] != 0.0f) {
156           size++;
157         }
158       }
159     }
160     this.indexes = new int[size];
161     this.values = new float[size];
162 
163     // Copy the values
164     {
165       int pos = 0;
166       for(int i = 0; i < values.length; i++) {
167         float value = values[i];
168         if(value != 0.0f) {
169           this.indexes[pos] = i;
170           this.values[pos] = value;
171           pos++;
172         }
173       }
174     }
175   }
176 
177   @Override
getDimensionality()178   public int getDimensionality() {
179     return dimensionality;
180   }
181 
182   /**
183    * Sets the dimensionality to the new value.
184    *
185    *
186    * @param dimensionality the new dimensionality
187    * @throws IllegalArgumentException if the given dimensionality is too small
188    *         to cover the given values (i.e., the maximum index of any value not
189    *         zero is bigger than the given dimensionality)
190    */
191   @Override
setDimensionality(int dimensionality)192   public void setDimensionality(int dimensionality) throws IllegalArgumentException {
193     final int maxdim = getMaxDim();
194     if(maxdim > dimensionality) {
195       throw new IllegalArgumentException("Given dimensionality " + dimensionality + " is too small w.r.t. the given values (occurring maximum: " + maxdim + ").");
196     }
197     this.dimensionality = dimensionality;
198   }
199 
200   @Override
201   @Deprecated
getValue(int dimension)202   public Float getValue(int dimension) {
203     int pos = Arrays.binarySearch(this.indexes, dimension);
204     return (pos >= 0) ? values[pos] : FLOAT0;
205   }
206 
207   @Override
208   @Deprecated
doubleValue(int dimension)209   public double doubleValue(int dimension) {
210     int pos = Arrays.binarySearch(this.indexes, dimension);
211     return (pos >= 0) ? values[pos] : 0.;
212   }
213 
214   @Override
215   @Deprecated
floatValue(int dimension)216   public float floatValue(int dimension) {
217     int pos = Arrays.binarySearch(this.indexes, dimension);
218     return (pos >= 0) ? values[pos] : 0.f;
219   }
220 
221   @Override
222   @Deprecated
longValue(int dimension)223   public long longValue(int dimension) {
224     int pos = Arrays.binarySearch(this.indexes, dimension);
225     return (pos >= 0) ? (long) values[pos] : 0L;
226   }
227 
228   @Override
toArray()229   public double[] toArray() {
230     double[] vals = new double[dimensionality];
231     for(int i = 0; i < indexes.length; i++) {
232       vals[this.indexes[i]] = this.values[i];
233     }
234     return vals;
235   }
236 
237   /**
238    * Create a String representation of this SparseFloatVector as suitable for
239    * {@link de.lmu.ifi.dbs.elki.datasource.parser.SparseNumberVectorLabelParser}
240    * .
241    *
242    * The returned String is a single line with entries separated by
243    * {@link NumberVector#ATTRIBUTE_SEPARATOR}. The first entry gives the
244    * number of values actually not zero. Following entries are pairs of Integer
245    * and Float where the Integer gives the index of the dimensionality and the
246    * Float gives the corresponding value.
247    *
248    * Example: a vector (0,1.2,1.3,0)<sup>T</sup> would result in the String<br>
249    * <code>2 2 1.2 3 1.3</code><br>
250    *
251    * @return a String representation of this SparseFloatVector
252    */
253   @Override
toString()254   public String toString() {
255     StringBuilder featureLine = new StringBuilder(15 * this.indexes.length)//
256         .append(this.indexes.length);
257     for(int i = 0; i < this.indexes.length; i++) {
258       featureLine.append(ATTRIBUTE_SEPARATOR).append(this.indexes[i])//
259           .append(ATTRIBUTE_SEPARATOR).append(this.values[i]);
260     }
261     return featureLine.toString();
262   }
263 
264   @Override
iterDim(int iter)265   public int iterDim(int iter) {
266     return indexes[iter];
267   }
268 
269   @Override
iterValid(int iter)270   public boolean iterValid(int iter) {
271     return iter < indexes.length;
272   }
273 
274   @Override
iterDoubleValue(int iter)275   public double iterDoubleValue(int iter) {
276     return (double) values[iter];
277   }
278 
279   @Override
iterFloatValue(int iter)280   public float iterFloatValue(int iter) {
281     return values[iter];
282   }
283 
284   @Override
iterLongValue(int iter)285   public long iterLongValue(int iter) {
286     return (long) values[iter];
287   }
288 
289   /**
290    * Factory class.
291    *
292    * @author Erich Schubert
293    *
294    * @has - - - SparseFloatVector
295    */
296   public static class Factory implements SparseNumberVector.Factory<SparseFloatVector> {
297     @Override
newFeatureVector(A array, ArrayAdapter<? extends Number, A> adapter)298     public <A> SparseFloatVector newFeatureVector(A array, ArrayAdapter<? extends Number, A> adapter) {
299       int dim = adapter.size(array);
300       float[] values = new float[dim];
301       for(int i = 0; i < dim; i++) {
302         values[i] = adapter.get(array, i).floatValue();
303       }
304       // TODO: inefficient
305       return new SparseFloatVector(values);
306     }
307 
308     @Override
newNumberVector(A array, NumberArrayAdapter<?, ? super A> adapter)309     public <A> SparseFloatVector newNumberVector(A array, NumberArrayAdapter<?, ? super A> adapter) {
310       int dim = adapter.size(array);
311       float[] values = new float[dim];
312       for(int i = 0; i < dim; i++) {
313         values[i] = adapter.getFloat(array, i);
314       }
315       // TODO: inefficient
316       return new SparseFloatVector(values);
317     }
318 
319     @Override
newNumberVector(Int2DoubleOpenHashMap dvalues, int maxdim)320     public SparseFloatVector newNumberVector(Int2DoubleOpenHashMap dvalues, int maxdim) {
321       int[] indexes = new int[dvalues.size()];
322       float[] values = new float[dvalues.size()];
323       // Import and sort the indexes
324       ObjectIterator<Int2DoubleMap.Entry> iter = dvalues.int2DoubleEntrySet().fastIterator();
325       for(int i = 0; iter.hasNext(); i++) {
326         indexes[i] = iter.next().getIntKey();
327       }
328       Arrays.sort(indexes);
329       // Import the values accordingly
330       for(int i = 0; i < dvalues.size(); i++) {
331         values[i] = (float) dvalues.get(indexes[i]);
332       }
333       return new SparseFloatVector(indexes, values, maxdim);
334     }
335 
336     @Override
getDefaultSerializer()337     public ByteBufferSerializer<SparseFloatVector> getDefaultSerializer() {
338       return VARIABLE_SERIALIZER;
339     }
340 
341     @Override
getRestrictionClass()342     public Class<? super SparseFloatVector> getRestrictionClass() {
343       return SparseFloatVector.class;
344     }
345 
346     /**
347      * Parameterization class.
348      *
349      * @author Erich Schubert
350      */
351     public static class Parameterizer extends AbstractParameterizer {
352       @Override
makeInstance()353       protected SparseFloatVector.Factory makeInstance() {
354         return FACTORY;
355       }
356     }
357   }
358 
359   /**
360    * Serialization class using VarInt encodings.
361    *
362    * @author Erich Schubert
363    *
364    * @assoc - serializes - SparseFloatVector
365    */
366   public static class VariableSerializer implements ByteBufferSerializer<SparseFloatVector> {
367     @Override
fromByteBuffer(ByteBuffer buffer)368     public SparseFloatVector fromByteBuffer(ByteBuffer buffer) throws IOException {
369       final int dimensionality = ByteArrayUtil.readUnsignedVarint(buffer);
370       final int nonzero = ByteArrayUtil.readUnsignedVarint(buffer);
371       final int[] dims = new int[nonzero];
372       final float[] values = new float[nonzero];
373       for(int i = 0; i < nonzero; i++) {
374         dims[i] = ByteArrayUtil.readUnsignedVarint(buffer);
375         values[i] = buffer.getFloat();
376       }
377       return new SparseFloatVector(dims, values, dimensionality);
378     }
379 
380     @Override
toByteBuffer(ByteBuffer buffer, SparseFloatVector vec)381     public void toByteBuffer(ByteBuffer buffer, SparseFloatVector vec) throws IOException {
382       ByteArrayUtil.writeUnsignedVarint(buffer, vec.dimensionality);
383       ByteArrayUtil.writeUnsignedVarint(buffer, vec.values.length);
384       for(int i = 0; i < vec.values.length; i++) {
385         ByteArrayUtil.writeUnsignedVarint(buffer, vec.indexes[i]);
386         buffer.putFloat(vec.values[i]);
387       }
388     }
389 
390     @Override
getByteSize(SparseFloatVector vec)391     public int getByteSize(SparseFloatVector vec) {
392       int sum = 0;
393       sum += ByteArrayUtil.getUnsignedVarintSize(vec.dimensionality);
394       sum += ByteArrayUtil.getUnsignedVarintSize(vec.values.length);
395       for(int d : vec.indexes) {
396         sum += ByteArrayUtil.getUnsignedVarintSize(d);
397       }
398       sum += vec.values.length * ByteArrayUtil.SIZE_FLOAT;
399       return sum;
400     }
401   }
402 }
403