1 /*
2  * This file is part of ELKI:
3  * Environment for Developing KDD-Applications Supported by Index-Structures
4  *
5  * Copyright (C) 2018
6  * ELKI Development Team
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Affero General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Affero General Public License for more details.
17  *
18  * You should have received a copy of the GNU Affero General Public License
19  * along with this program. If not, see <http://www.gnu.org/licenses/>.
20  */
21 package de.lmu.ifi.dbs.elki.distance.distancefunction.external;
22 
23 import java.io.*;
24 
25 import de.lmu.ifi.dbs.elki.database.ids.DBID;
26 import de.lmu.ifi.dbs.elki.database.ids.DBIDRange;
27 import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
28 import de.lmu.ifi.dbs.elki.database.relation.Relation;
29 import de.lmu.ifi.dbs.elki.distance.distancefunction.AbstractDBIDRangeDistanceFunction;
30 import de.lmu.ifi.dbs.elki.logging.Logging;
31 import de.lmu.ifi.dbs.elki.utilities.Alias;
32 import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
33 import de.lmu.ifi.dbs.elki.utilities.io.FileUtil;
34 import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
35 import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
36 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
37 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
38 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.FileParameter;
39 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
40 
41 import it.unimi.dsi.fastutil.longs.Long2FloatOpenHashMap;
42 
43 /**
44  * Distance function that is based on float distances given by a distance matrix
45  * of an external ASCII file.
46  * <p>
47  * Note: parsing an ASCII file is rather expensive.
48  * <p>
49  * See {@link AsciiDistanceParser} for the default input format.
50  * <p>
51  * TODO: use a {@code float[]} instead of the hash map?
52  *
53  * @author Elke Achtert
54  * @author Erich Schubert
55  * @since 0.1
56  *
57  * @composed - - - DistanceCacheWriter
58  */
59 @Alias("de.lmu.ifi.dbs.elki.distance.distancefunction.external.FileBasedFloatDistanceFunction")
60 public class FileBasedSparseFloatDistanceFunction extends AbstractDBIDRangeDistanceFunction {
61   /**
62    * Class logger.
63    */
64   private static final Logging LOG = Logging.getLogger(FileBasedSparseFloatDistanceFunction.class);
65 
66   /**
67    * The distance cache
68    */
69   private Long2FloatOpenHashMap cache;
70 
71   /**
72    * Distance parser
73    */
74   private DistanceParser parser;
75 
76   /**
77    * Input file of distance matrix
78    */
79   private File matrixfile;
80 
81   /**
82    * Minimum and maximum IDs seen.
83    */
84   private int min, max;
85 
86   /**
87    * Distance to return when not defined otherwise.
88    */
89   protected float defaultDistance = Float.POSITIVE_INFINITY;
90 
91   /**
92    * Constructor.
93    *
94    * @param parser Parser
95    * @param matrixfile input file
96    * @param defaultDistance Default distance (when undefined)
97    */
FileBasedSparseFloatDistanceFunction(DistanceParser parser, File matrixfile, float defaultDistance)98   public FileBasedSparseFloatDistanceFunction(DistanceParser parser, File matrixfile, float defaultDistance) {
99     super();
100     this.parser = parser;
101     this.matrixfile = matrixfile;
102     this.defaultDistance = defaultDistance;
103   }
104 
105   @Override
instantiate(Relation<O> relation)106   public <O extends DBID> DistanceQuery<O> instantiate(Relation<O> relation) {
107     if(cache == null) {
108       try {
109         loadCache(relation.size(), new BufferedInputStream(FileUtil.tryGzipInput(new FileInputStream(matrixfile))));
110       }
111       catch(IOException e) {
112         throw new AbortException("Could not load external distance file: " + matrixfile.toString(), e);
113       }
114     }
115     return super.instantiate(relation);
116   }
117 
118   @Override
distance(int i1, int i2)119   public double distance(int i1, int i2) {
120     return (i1 == i2) ? 0. : cache.get(makeKey(i1 + min, i2 + min));
121   }
122 
123   /**
124    * Fill cache from an input stream.
125    *
126    * @param size Expected size
127    * @param in Input stream
128    * @throws IOException
129    */
loadCache(int size, InputStream in)130   protected void loadCache(int size, InputStream in) throws IOException {
131     // Expect a sparse matrix here
132     cache = new Long2FloatOpenHashMap(size * 20);
133     cache.defaultReturnValue(Float.POSITIVE_INFINITY);
134     min = Integer.MAX_VALUE;
135     max = Integer.MIN_VALUE;
136     parser.parse(in, new DistanceCacheWriter() {
137       @Override
138       public void put(int id1, int id2, double distance) {
139         if(id1 < id2) {
140           min = id1 < min ? id1 : min;
141           max = id2 > max ? id2 : max;
142         }
143         else {
144           min = id2 < min ? id2 : min;
145           max = id1 > max ? id1 : max;
146         }
147         cache.put(makeKey(id1, id2), (float) distance);
148       }
149     });
150     if(min != 0) {
151       LOG.verbose("Distance matrix is supposed to be 0-indexed. Choosing offset " + min + " to compensate.");
152     }
153     if(max + 1 - min != size) {
154       LOG.warning("ID range is not consistent with relation size.");
155     }
156   }
157 
158   /**
159    * Combine two integer ids into a long value.
160    *
161    * @param i1 First id
162    * @param i2 Second id
163    * @return Combined value
164    */
makeKey(int i1, int i2)165   protected static final long makeKey(int i1, int i2) {
166     return (i1 < i2) //
167         ? ((((long) i1) << 32) | i2)//
168         : ((((long) i2) << 32) | i1);
169   }
170 
171   @Override
checkRange(DBIDRange range)172   public void checkRange(DBIDRange range) {
173     final int size = max + 1 - min;
174     if(size < range.size()) {
175       LOG.warning("Distance matrix has size " + size + " but range has size: " + range.size());
176     }
177   }
178 
179   @Override
equals(Object obj)180   public boolean equals(Object obj) {
181     if(obj == null) {
182       return false;
183     }
184     if(getClass() != obj.getClass()) {
185       return false;
186     }
187     FileBasedSparseFloatDistanceFunction other = (FileBasedSparseFloatDistanceFunction) obj;
188     return this.cache.equals(other.cache);
189   }
190 
191   /**
192    * Parameterization class.
193    *
194    * @author Erich Schubert
195    */
196   public static class Parameterizer extends AbstractParameterizer {
197     /**
198      * Parameter that specifies the name of the distance matrix file.
199      */
200     public static final OptionID MATRIX_ID = FileBasedSparseDoubleDistanceFunction.Parameterizer.MATRIX_ID;
201 
202     /**
203      * Optional parameter to specify the parsers to provide a database, must
204      * extend {@link DistanceParser}. If this parameter is not set,
205      * {@link AsciiDistanceParser} is used as parser for all input files.
206      */
207     public static final OptionID PARSER_ID = FileBasedSparseDoubleDistanceFunction.Parameterizer.PARSER_ID;
208 
209     /**
210      * Optional parameter to specify the distance to return when no distance was
211      * given in the file. Defaults to infinity.
212      */
213     public static final OptionID DEFAULTDIST_ID = FileBasedSparseDoubleDistanceFunction.Parameterizer.DEFAULTDIST_ID;
214 
215     /**
216      * Input file.
217      */
218     protected File matrixfile = null;
219 
220     /**
221      * Parser for input file.
222      */
223     protected DistanceParser parser = null;
224 
225     /**
226      * Distance to return when not defined otherwise.
227      */
228     protected float defaultDistance = Float.POSITIVE_INFINITY;
229 
230     @Override
makeOptions(Parameterization config)231     protected void makeOptions(Parameterization config) {
232       super.makeOptions(config);
233       final FileParameter MATRIX_PARAM = new FileParameter(MATRIX_ID, FileParameter.FileType.INPUT_FILE);
234       if(config.grab(MATRIX_PARAM)) {
235         matrixfile = MATRIX_PARAM.getValue();
236       }
237 
238       final ObjectParameter<DistanceParser> PARSER_PARAM = new ObjectParameter<>(PARSER_ID, DistanceParser.class, AsciiDistanceParser.class);
239       if(config.grab(PARSER_PARAM)) {
240         parser = PARSER_PARAM.instantiateClass(config);
241       }
242 
243       DoubleParameter distanceP = new DoubleParameter(DEFAULTDIST_ID, Double.POSITIVE_INFINITY);
244       if(config.grab(distanceP)) {
245         defaultDistance = (float) distanceP.doubleValue();
246       }
247     }
248 
249     @Override
makeInstance()250     protected FileBasedSparseFloatDistanceFunction makeInstance() {
251       return new FileBasedSparseFloatDistanceFunction(parser, matrixfile, defaultDistance);
252     }
253   }
254 }
255