1 import libsvm.*; 2 import java.io.*; 3 import java.util.*; 4 import java.text.DecimalFormat; 5 6 class svm_scale 7 { 8 private String line = null; 9 private double lower = -1.0; 10 private double upper = 1.0; 11 private double y_lower; 12 private double y_upper; 13 private boolean y_scaling = false; 14 private double[] feature_max; 15 private double[] feature_min; 16 private double y_max = -Double.MAX_VALUE; 17 private double y_min = Double.MAX_VALUE; 18 private int max_index; 19 private long num_nonzeros = 0; 20 private long new_num_nonzeros = 0; 21 exit_with_help()22 private static void exit_with_help() 23 { 24 System.out.print( 25 "Usage: svm-scale [options] data_filename\n" 26 +"options:\n" 27 +"-l lower : x scaling lower limit (default -1)\n" 28 +"-u upper : x scaling upper limit (default +1)\n" 29 +"-y y_lower y_upper : y scaling limits (default: no y scaling)\n" 30 +"-s save_filename : save scaling parameters to save_filename\n" 31 +"-r restore_filename : restore scaling parameters from restore_filename\n" 32 ); 33 System.exit(1); 34 } 35 rewind(BufferedReader fp, String filename)36 private BufferedReader rewind(BufferedReader fp, String filename) throws IOException 37 { 38 fp.close(); 39 return new BufferedReader(new FileReader(filename)); 40 } 41 output_target(double value)42 private void output_target(double value) 43 { 44 if(y_scaling) 45 { 46 if(value == y_min) 47 value = y_lower; 48 else if(value == y_max) 49 value = y_upper; 50 else 51 value = y_lower + (y_upper-y_lower) * 52 (value-y_min) / (y_max-y_min); 53 } 54 55 System.out.print(value + " "); 56 } 57 output(int index, double value)58 private void output(int index, double value) 59 { 60 /* skip single-valued attribute */ 61 if(feature_max[index] == feature_min[index]) 62 return; 63 64 if(value == feature_min[index]) 65 value = lower; 66 else if(value == feature_max[index]) 67 value = upper; 68 else 69 value = lower + (upper-lower) * 70 (value-feature_min[index])/ 71 (feature_max[index]-feature_min[index]); 72 73 if(value != 0) 74 { 75 System.out.print(index + ":" + value + " "); 76 new_num_nonzeros++; 77 } 78 } 79 readline(BufferedReader fp)80 private String readline(BufferedReader fp) throws IOException 81 { 82 line = fp.readLine(); 83 return line; 84 } 85 run(String []argv)86 private void run(String []argv) throws IOException 87 { 88 int i,index; 89 BufferedReader fp = null, fp_restore = null; 90 String save_filename = null; 91 String restore_filename = null; 92 String data_filename = null; 93 94 95 for(i=0;i<argv.length;i++) 96 { 97 if (argv[i].charAt(0) != '-') break; 98 ++i; 99 switch(argv[i-1].charAt(1)) 100 { 101 case 'l': lower = Double.parseDouble(argv[i]); break; 102 case 'u': upper = Double.parseDouble(argv[i]); break; 103 case 'y': 104 y_lower = Double.parseDouble(argv[i]); 105 ++i; 106 y_upper = Double.parseDouble(argv[i]); 107 y_scaling = true; 108 break; 109 case 's': save_filename = argv[i]; break; 110 case 'r': restore_filename = argv[i]; break; 111 default: 112 System.err.println("unknown option"); 113 exit_with_help(); 114 } 115 } 116 117 if(!(upper > lower) || (y_scaling && !(y_upper > y_lower))) 118 { 119 System.err.println("inconsistent lower/upper specification"); 120 System.exit(1); 121 } 122 if(restore_filename != null && save_filename != null) 123 { 124 System.err.println("cannot use -r and -s simultaneously"); 125 System.exit(1); 126 } 127 128 if(argv.length != i+1) 129 exit_with_help(); 130 131 data_filename = argv[i]; 132 try { 133 fp = new BufferedReader(new FileReader(data_filename)); 134 } catch (Exception e) { 135 System.err.println("can't open file " + data_filename); 136 System.exit(1); 137 } 138 139 /* assumption: min index of attributes is 1 */ 140 /* pass 1: find out max index of attributes */ 141 max_index = 0; 142 143 if(restore_filename != null) 144 { 145 int idx, c; 146 147 try { 148 fp_restore = new BufferedReader(new FileReader(restore_filename)); 149 } 150 catch (Exception e) { 151 System.err.println("can't open file " + restore_filename); 152 System.exit(1); 153 } 154 if((c = fp_restore.read()) == 'y') 155 { 156 fp_restore.readLine(); 157 fp_restore.readLine(); 158 fp_restore.readLine(); 159 } 160 fp_restore.readLine(); 161 fp_restore.readLine(); 162 163 String restore_line = null; 164 while((restore_line = fp_restore.readLine())!=null) 165 { 166 StringTokenizer st2 = new StringTokenizer(restore_line); 167 idx = Integer.parseInt(st2.nextToken()); 168 max_index = Math.max(max_index, idx); 169 } 170 fp_restore = rewind(fp_restore, restore_filename); 171 } 172 173 while (readline(fp) != null) 174 { 175 StringTokenizer st = new StringTokenizer(line," \t\n\r\f:"); 176 st.nextToken(); 177 while(st.hasMoreTokens()) 178 { 179 index = Integer.parseInt(st.nextToken()); 180 max_index = Math.max(max_index, index); 181 st.nextToken(); 182 num_nonzeros++; 183 } 184 } 185 186 try { 187 feature_max = new double[(max_index+1)]; 188 feature_min = new double[(max_index+1)]; 189 } catch(OutOfMemoryError e) { 190 System.err.println("can't allocate enough memory"); 191 System.exit(1); 192 } 193 194 for(i=0;i<=max_index;i++) 195 { 196 feature_max[i] = -Double.MAX_VALUE; 197 feature_min[i] = Double.MAX_VALUE; 198 } 199 200 fp = rewind(fp, data_filename); 201 202 /* pass 2: find out min/max value */ 203 while(readline(fp) != null) 204 { 205 int next_index = 1; 206 double target; 207 double value; 208 209 StringTokenizer st = new StringTokenizer(line," \t\n\r\f:"); 210 target = Double.parseDouble(st.nextToken()); 211 y_max = Math.max(y_max, target); 212 y_min = Math.min(y_min, target); 213 214 while (st.hasMoreTokens()) 215 { 216 index = Integer.parseInt(st.nextToken()); 217 value = Double.parseDouble(st.nextToken()); 218 219 for (i = next_index; i<index; i++) 220 { 221 feature_max[i] = Math.max(feature_max[i], 0); 222 feature_min[i] = Math.min(feature_min[i], 0); 223 } 224 225 feature_max[index] = Math.max(feature_max[index], value); 226 feature_min[index] = Math.min(feature_min[index], value); 227 next_index = index + 1; 228 } 229 230 for(i=next_index;i<=max_index;i++) 231 { 232 feature_max[i] = Math.max(feature_max[i], 0); 233 feature_min[i] = Math.min(feature_min[i], 0); 234 } 235 } 236 237 fp = rewind(fp, data_filename); 238 239 /* pass 2.5: save/restore feature_min/feature_max */ 240 if(restore_filename != null) 241 { 242 // fp_restore rewinded in finding max_index 243 int idx, c; 244 double fmin, fmax; 245 246 fp_restore.mark(2); // for reset 247 if((c = fp_restore.read()) == 'y') 248 { 249 fp_restore.readLine(); // pass the '\n' after 'y' 250 StringTokenizer st = new StringTokenizer(fp_restore.readLine()); 251 y_lower = Double.parseDouble(st.nextToken()); 252 y_upper = Double.parseDouble(st.nextToken()); 253 st = new StringTokenizer(fp_restore.readLine()); 254 y_min = Double.parseDouble(st.nextToken()); 255 y_max = Double.parseDouble(st.nextToken()); 256 y_scaling = true; 257 } 258 else 259 fp_restore.reset(); 260 261 if(fp_restore.read() == 'x') { 262 fp_restore.readLine(); // pass the '\n' after 'x' 263 StringTokenizer st = new StringTokenizer(fp_restore.readLine()); 264 lower = Double.parseDouble(st.nextToken()); 265 upper = Double.parseDouble(st.nextToken()); 266 String restore_line = null; 267 while((restore_line = fp_restore.readLine())!=null) 268 { 269 StringTokenizer st2 = new StringTokenizer(restore_line); 270 idx = Integer.parseInt(st2.nextToken()); 271 fmin = Double.parseDouble(st2.nextToken()); 272 fmax = Double.parseDouble(st2.nextToken()); 273 if (idx <= max_index) 274 { 275 feature_min[idx] = fmin; 276 feature_max[idx] = fmax; 277 } 278 } 279 } 280 fp_restore.close(); 281 } 282 283 if(save_filename != null) 284 { 285 Formatter formatter = new Formatter(new StringBuilder()); 286 BufferedWriter fp_save = null; 287 288 try { 289 fp_save = new BufferedWriter(new FileWriter(save_filename)); 290 } catch(IOException e) { 291 System.err.println("can't open file " + save_filename); 292 System.exit(1); 293 } 294 295 if(y_scaling) 296 { 297 formatter.format("y\n"); 298 formatter.format("%.16g %.16g\n", y_lower, y_upper); 299 formatter.format("%.16g %.16g\n", y_min, y_max); 300 } 301 formatter.format("x\n"); 302 formatter.format("%.16g %.16g\n", lower, upper); 303 for(i=1;i<=max_index;i++) 304 { 305 if(feature_min[i] != feature_max[i]) 306 formatter.format("%d %.16g %.16g\n", i, feature_min[i], feature_max[i]); 307 } 308 fp_save.write(formatter.toString()); 309 fp_save.close(); 310 } 311 312 /* pass 3: scale */ 313 while(readline(fp) != null) 314 { 315 int next_index = 1; 316 double target; 317 double value; 318 319 StringTokenizer st = new StringTokenizer(line," \t\n\r\f:"); 320 target = Double.parseDouble(st.nextToken()); 321 output_target(target); 322 while(st.hasMoreElements()) 323 { 324 index = Integer.parseInt(st.nextToken()); 325 value = Double.parseDouble(st.nextToken()); 326 for (i = next_index; i<index; i++) 327 output(i, 0); 328 output(index, value); 329 next_index = index + 1; 330 } 331 332 for(i=next_index;i<= max_index;i++) 333 output(i, 0); 334 System.out.print("\n"); 335 } 336 if (new_num_nonzeros > num_nonzeros) 337 System.err.print( 338 "WARNING: original #nonzeros " + num_nonzeros+"\n" 339 +" new #nonzeros " + new_num_nonzeros+"\n" 340 +"Use -l 0 if many original feature values are zeros\n"); 341 342 fp.close(); 343 } 344 main(String argv[])345 public static void main(String argv[]) throws IOException 346 { 347 svm_scale s = new svm_scale(); 348 s.run(argv); 349 } 350 } 351