1 /*
2  * This file is part of ELKI:
3  * Environment for Developing KDD-Applications Supported by Index-Structures
4  *
5  * Copyright (C) 2018
6  * ELKI Development Team
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Affero General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Affero General Public License for more details.
17  *
18  * You should have received a copy of the GNU Affero General Public License
19  * along with this program. If not, see <http://www.gnu.org/licenses/>.
20  */
21 package de.lmu.ifi.dbs.elki.data;
22 
23 import java.io.IOException;
24 import java.nio.ByteBuffer;
25 import java.util.Arrays;
26 
27 import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayAdapter;
28 import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
29 import de.lmu.ifi.dbs.elki.utilities.io.ByteArrayUtil;
30 import de.lmu.ifi.dbs.elki.utilities.io.ByteBufferSerializer;
31 import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
32 
33 import it.unimi.dsi.fastutil.ints.Int2DoubleMap;
34 import it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap;
35 import it.unimi.dsi.fastutil.objects.ObjectIterator;
36 
37 /**
38  * Sparse vector type, using {@code byte[]} for storing the values, and
39  * {@code int[]} for storing the indexes, approximately 5 bytes per non-zero
40  * value (limited to -128..+127).
41  *
42  * @author Arthur Zimek
43  * @since 0.6.0
44  */
45 public class SparseByteVector implements SparseNumberVector {
46   /**
47    * Static instance.
48    */
49   public static final SparseByteVector.Factory FACTORY = new SparseByteVector.Factory();
50 
51   /**
52    * Serializer using varint encoding.
53    */
54   public static final ByteBufferSerializer<SparseByteVector> VARIABLE_SERIALIZER = new VariableSerializer();
55 
56   /**
57    * Indexes of values.
58    */
59   private final int[] indexes;
60 
61   /**
62    * Stored values.
63    */
64   private final byte[] values;
65 
66   /**
67    * The dimensionality of this feature vector.
68    */
69   private int dimensionality;
70 
71   /**
72    * Direct constructor.
73    *
74    * @param indexes Indexes Must be sorted!
75    * @param values Associated value.
76    * @param dimensionality "true" dimensionality
77    */
SparseByteVector(int[] indexes, byte[] values, int dimensionality)78   public SparseByteVector(int[] indexes, byte[] values, int dimensionality) {
79     super();
80     this.indexes = indexes;
81     this.values = values;
82     this.dimensionality = dimensionality;
83   }
84 
85   /**
86    * Create a SparseByteVector consisting of double values according to the
87    * specified mapping of indices and values.
88    *
89    * @param values the values to be set as values of the real vector
90    * @param dimensionality the dimensionality of this feature vector
91    * @throws IllegalArgumentException if the given dimensionality is too small
92    *         to cover the given values (i.e., the maximum index of any value not
93    *         zero is bigger than the given dimensionality)
94    */
SparseByteVector(Int2DoubleOpenHashMap values, int dimensionality)95   public SparseByteVector(Int2DoubleOpenHashMap values, int dimensionality) throws IllegalArgumentException {
96     if(values.size() > dimensionality) {
97       throw new IllegalArgumentException("values.size() > dimensionality!");
98     }
99 
100     this.indexes = new int[values.size()];
101     this.values = new byte[values.size()];
102     // Import and sort the indexes
103     {
104       ObjectIterator<Int2DoubleMap.Entry> iter = values.int2DoubleEntrySet().fastIterator();
105       for(int i = 0; iter.hasNext(); i++) {
106         this.indexes[i] = iter.next().getIntKey();
107       }
108       Arrays.sort(this.indexes);
109     }
110     // Import the values accordingly
111     {
112       for(int i = 0; i < values.size(); i++) {
113         this.values[i] = (byte) values.get(this.indexes[i]);
114       }
115     }
116     this.dimensionality = dimensionality;
117     final int maxdim = getMaxDim();
118     if(maxdim > dimensionality) {
119       throw new IllegalArgumentException("Given dimensionality " + dimensionality + " is too small w.r.t. the given values (occurring maximum: " + maxdim + ").");
120     }
121   }
122 
123   /**
124    * Get the maximum dimensionality.
125    *
126    * @return the maximum dimensionality seen
127    */
getMaxDim()128   private int getMaxDim() {
129     return (this.indexes.length == 0) ? 0 : this.indexes[this.indexes.length - 1];
130   }
131 
132   /**
133    * Create a SparseByteVector consisting of double values according to the
134    * specified mapping of indices and values.
135    *
136    * @param values the values to be set as values of the real vector
137    * @throws IllegalArgumentException if the given dimensionality is too small
138    *         to cover the given values (i.e., the maximum index of any value not
139    *         zero is bigger than the given dimensionality)
140    */
SparseByteVector(byte[] values)141   public SparseByteVector(byte[] values) throws IllegalArgumentException {
142     this.dimensionality = values.length;
143 
144     // Count the number of non-zero entries
145     int size = 0;
146     {
147       for(int i = 0; i < values.length; i++) {
148         if(values[i] != 0) {
149           size++;
150         }
151       }
152     }
153     this.indexes = new int[size];
154     this.values = new byte[size];
155 
156     // Copy the values
157     {
158       int pos = 0;
159       for(int i = 0; i < values.length; i++) {
160         byte value = values[i];
161         if(value != 0) {
162           this.indexes[pos] = i;
163           this.values[pos] = value;
164           pos++;
165         }
166       }
167     }
168   }
169 
170   @Override
getDimensionality()171   public int getDimensionality() {
172     return dimensionality;
173   }
174 
175   /**
176    * Sets the dimensionality to the new value.
177    *
178    *
179    * @param dimensionality the new dimensionality
180    * @throws IllegalArgumentException if the given dimensionality is too small
181    *         to cover the given values (i.e., the maximum index of any value not
182    *         zero is bigger than the given dimensionality)
183    */
184   @Override
setDimensionality(int dimensionality)185   public void setDimensionality(int dimensionality) throws IllegalArgumentException {
186     final int maxdim = getMaxDim();
187     if(maxdim > dimensionality) {
188       throw new IllegalArgumentException("Given dimensionality " + dimensionality + " is too small w.r.t. the given values (occurring maximum: " + maxdim + ").");
189     }
190     this.dimensionality = dimensionality;
191   }
192 
193   /**
194    * Value to return for missing entires.
195    */
196   private static final Byte BYTE0 = Byte.valueOf((byte) 0);
197 
198   @Override
199   @Deprecated
getValue(int dimension)200   public Byte getValue(int dimension) {
201     int pos = Arrays.binarySearch(this.indexes, dimension);
202     return (pos >= 0) ? values[pos] : BYTE0;
203   }
204 
205   @Override
206   @Deprecated
doubleValue(int dimension)207   public double doubleValue(int dimension) {
208     int pos = Arrays.binarySearch(this.indexes, dimension);
209     return (pos >= 0) ? values[pos] : 0.;
210   }
211 
212   @Override
213   @Deprecated
longValue(int dimension)214   public long longValue(int dimension) {
215     int pos = Arrays.binarySearch(this.indexes, dimension);
216     return (pos >= 0) ? values[pos] : 0L;
217   }
218 
219   @Override
byteValue(int dimension)220   public byte byteValue(int dimension) {
221     int pos = Arrays.binarySearch(this.indexes, dimension);
222     return pos >= 0 ? values[pos] : 0;
223   }
224 
225   @Override
toArray()226   public double[] toArray() {
227     double[] vals = new double[dimensionality];
228     for(int i = 0; i < indexes.length; i++) {
229       vals[this.indexes[i]] = this.values[i];
230     }
231     return vals;
232   }
233 
234   /**
235    * Create a String representation of this SparseByteVector as suitable for
236    * {@link de.lmu.ifi.dbs.elki.datasource.parser.SparseNumberVectorLabelParser}
237    * .
238    * <p>
239    * The returned String is a single line with entries separated by
240    * {@link NumberVector#ATTRIBUTE_SEPARATOR}. The first entry gives the
241    * number of values actually not zero. Following entries are pairs of Byte and
242    * Byte where the Byte gives the index of the dimensionality and the Byte
243    * gives the corresponding value.
244    * <p>
245    * Example: a vector (0,1.2,1.3,0)<sup>T</sup> would result in the String<br>
246    * <code>2 2 1.2 3 1.3</code>
247    *
248    * @return a String representation of this SparseByteVector
249    */
250   @Override
toString()251   public String toString() {
252     StringBuilder featureLine = new StringBuilder(8 * this.indexes.length)//
253         .append(this.indexes.length);
254     for(int i = 0; i < this.indexes.length; i++) {
255       featureLine.append(ATTRIBUTE_SEPARATOR).append(this.indexes[i])//
256           .append(ATTRIBUTE_SEPARATOR).append(this.values[i]);
257     }
258     return featureLine.toString();
259   }
260 
261   @Override
iterDim(int iter)262   public int iterDim(int iter) {
263     return indexes[iter];
264   }
265 
266   @Override
iterValid(int iter)267   public boolean iterValid(int iter) {
268     return iter < indexes.length;
269   }
270 
271   @Override
iterDoubleValue(int iter)272   public double iterDoubleValue(int iter) {
273     return (double) values[iter];
274   }
275 
276   @Override
iterLongValue(int iter)277   public long iterLongValue(int iter) {
278     return (long) values[iter];
279   }
280 
281   @Override
iterByteValue(int iter)282   public byte iterByteValue(int iter) {
283     return values[iter];
284   }
285 
286   /**
287    * Factory class.
288    *
289    * @author Erich Schubert
290    *
291    * @has - - - SparseByteVector
292    */
293   public static class Factory implements SparseNumberVector.Factory<SparseByteVector> {
294     @Override
newFeatureVector(A array, ArrayAdapter<? extends Number, A> adapter)295     public <A> SparseByteVector newFeatureVector(A array, ArrayAdapter<? extends Number, A> adapter) {
296       int dim = adapter.size(array);
297       byte[] values = new byte[dim];
298       for(int i = 0; i < dim; i++) {
299         values[i] = adapter.get(array, i).byteValue();
300       }
301       // TODO: improve efficiency
302       return new SparseByteVector(values);
303     }
304 
305     @Override
newNumberVector(A array, NumberArrayAdapter<?, ? super A> adapter)306     public <A> SparseByteVector newNumberVector(A array, NumberArrayAdapter<?, ? super A> adapter) {
307       int dim = adapter.size(array);
308       byte[] values = new byte[dim];
309       for(int i = 0; i < dim; i++) {
310         values[i] = adapter.getByte(array, i);
311       }
312       // TODO: improve efficiency
313       return new SparseByteVector(values);
314     }
315 
316     @Override
newNumberVector(Int2DoubleOpenHashMap values, int maxdim)317     public SparseByteVector newNumberVector(Int2DoubleOpenHashMap values, int maxdim) {
318       return new SparseByteVector(values, maxdim);
319     }
320 
321     @Override
getDefaultSerializer()322     public ByteBufferSerializer<SparseByteVector> getDefaultSerializer() {
323       return VARIABLE_SERIALIZER;
324     }
325 
326     @Override
getRestrictionClass()327     public Class<? super SparseByteVector> getRestrictionClass() {
328       return SparseByteVector.class;
329     }
330 
331     /**
332      * Parameterization class.
333      *
334      * @author Erich Schubert
335      */
336     public static class Parameterizer extends AbstractParameterizer {
337       @Override
makeInstance()338       protected SparseByteVector.Factory makeInstance() {
339         return FACTORY;
340       }
341     }
342   }
343 
344   /**
345    * Serialization class using VarInt encodings.
346    *
347    * @author Erich Schubert
348    *
349    * @assoc - serializes - SparseByteVector
350    */
351   public static class VariableSerializer implements ByteBufferSerializer<SparseByteVector> {
352     @Override
fromByteBuffer(ByteBuffer buffer)353     public SparseByteVector fromByteBuffer(ByteBuffer buffer) throws IOException {
354       final int dimensionality = ByteArrayUtil.readUnsignedVarint(buffer);
355       final int nonzero = ByteArrayUtil.readUnsignedVarint(buffer);
356       final int[] dims = new int[nonzero];
357       final byte[] values = new byte[nonzero];
358       for(int i = 0; i < nonzero; i++) {
359         dims[i] = ByteArrayUtil.readUnsignedVarint(buffer);
360         values[i] = buffer.get();
361       }
362       return new SparseByteVector(dims, values, dimensionality);
363     }
364 
365     @Override
toByteBuffer(ByteBuffer buffer, SparseByteVector vec)366     public void toByteBuffer(ByteBuffer buffer, SparseByteVector vec) throws IOException {
367       ByteArrayUtil.writeUnsignedVarint(buffer, vec.dimensionality);
368       ByteArrayUtil.writeUnsignedVarint(buffer, vec.values.length);
369       for(int i = 0; i < vec.values.length; i++) {
370         ByteArrayUtil.writeUnsignedVarint(buffer, vec.indexes[i]);
371         buffer.put(vec.values[i]);
372       }
373     }
374 
375     @Override
getByteSize(SparseByteVector vec)376     public int getByteSize(SparseByteVector vec) {
377       int sum = 0;
378       sum += ByteArrayUtil.getUnsignedVarintSize(vec.dimensionality);
379       sum += ByteArrayUtil.getUnsignedVarintSize(vec.values.length);
380       for(int i = 0; i < vec.values.length; i++) {
381         sum += ByteArrayUtil.getUnsignedVarintSize(vec.indexes[i]);
382         ++sum;
383       }
384       return sum;
385     }
386   }
387 }
388