1 /*-----------------------------------------------------------------------
2  * linclassif.c: Implementation of linear classification rule classifying
3  *  examples from the SVM^light format.
4  *
5  * Copyright (C) 2008-2012 Vojtech Franc, xfrancv@cmp.felk.cvut.cz
6  *              Soeren Sonnenburg, soeren.sonnenburg@first.fraunhofer.de
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public
10  * License as published by the Free Software Foundation;
11  *-------------------------------------------------------------------- */
12 
13 
14 #include <stdio.h>
15 #include <string.h>
16 #include <stdint.h>
17 #include <math.h>
18 #include <stdlib.h>
19 #include <stdint.h>
20 
21 #include "lib_svmlight_format.h"
22 #include "libocas.h"
23 #include "version.h"
24 
/* Capacity (in chars) of the text-line buffer that main() allocates. */
#define MODELFILE_MAXLINELEN 1000000
26 
print_usage(void)27 void print_usage(void)
28 {
29   printf("LINCLASSIF: Predict labels by linear classication rule\n"
30          "          " OCAS_VERSION "\n"
31          "\n"
32          "   usage: linclass [options] example_file model_file\n"
33          "\n"
34          "Arguments:\n"
35          "         example_file    text file with testing examples stored in SVM^light format\n"
36          "         model_file      text file which contains either binary (two-class) linear\n"
37          "                         rule f(x)=w'*x+w0 or multi-class rule f(x)=W'*x which are\n"
38          "                         produced by svmocas and msvmocas, respectively\n"
39          "Options:\n"
40          "         -e              prints classification error computed from predicted\n"
41          "                         labels and labels contained in the example_file.\n"
42          "         -h              this help\n"
43          "         -o output_file  save predictions to output_file rather than to stdout.\n"
44          "         -t [0,1]        output type: 0 .. predicted labels (default)\n"
45          "                                      1 .. discriminant values\n"
46          "         -v [0,1]        verbosity level (default 0).\n"
47          "\n"
48          "Examples\n"
49          "  Train SVM classifier from riply_trn.light with regularization constant C = 10,\n"
50          "  bias switched on, verbosity switched off and model saved to svmocas.model\n"
51          "    ./svmocas -c 10 -b 1 -v 0 ./data/riply_trn.light ./data/svmocas.model \n"
52          "\n"
53          "  Compute testing error of the classifier stored in svmocas.model using testing\n"
54          "  examples from riply_tst.light and save predicted labels to riply_tst.pred\n"
55          "    ./linclassif -e -o ./data/riply_tst.pred ./data/riply_tst.light ./data/svmocas.model\n"
56          "\n"
57          );
58 }
59 
main(int argc,char * argv[])60 int main(int argc, char *argv[])
61 {
62   uint32_t i, j;
63   int len;
64   int recognized;
65   int exitflag = 1;
66   int verb;
67   int binary_problem;
68   int output_type;
69   int print_error;
70   char *line;
71 
72   char *model_fname;
73   char *input_fname;
74   char *output_fname;
75   FILE *fid, *fout;
76 
77   double *feat_val;
78   uint32_t *feat_idx;
79 
80   int go = 1;
81   char *endptr, *begptr;
82   int nLines = 0;
83   int nCols = 0, tmp_nCols;
84   double val;
85   double *W;
86   double W0;
87   uint32_t nY, nDim;
88 
89   long line_cnt = 0;
90   int label, pred_label=0;
91   long max_dim = 0;
92   long nnzf;
93   double dfce, max_dfce;
94   long nErrors = 0;
95   long *nClassErrors, *nClass;
96 
97   /* init */
98   fid = NULL;
99   fout = NULL;
100   feat_val = NULL;
101   feat_idx = NULL;
102   W = NULL;
103   W0 = 0;
104   input_fname = NULL;
105   model_fname = NULL;
106   output_fname = NULL;
107   nClassErrors = NULL;
108   nClass = NULL;
109 
110   /* default setting of input arguments*/
111   verb = 0;
112   print_error = 0;
113   output_type = 0;
114 
115   /* Allocate memory */
116   line = calloc(MODELFILE_MAXLINELEN, sizeof(char));
117   if( line == NULL )
118   {
119     fprintf(stderr,"Not enough memmory to allocate line buffer.\n");
120     goto clean_up;
121   }
122 
123   feat_idx = calloc(LIBSLF_MAXLINELEN, sizeof(uint32_t));
124   if( feat_idx == NULL )
125   {
126     fprintf(stderr,"Not enough memmory to allocate feat_idx.\n");
127     goto clean_up;
128   }
129 
130   feat_val = calloc(LIBSLF_MAXLINELEN, sizeof(double));
131   if( feat_val == NULL )
132   {
133     fprintf(stderr,"Not enough memmory to allocate feat_val.\n");
134     goto clean_up;
135   }
136 
137 
138   /*-----------------------------------------------------------
139     Process input arguments
140   ------------------------------------------------------------*/
141   if(argc ==1 || strcmp(argv[1], "-h") == 0)
142   {
143     print_usage();
144     goto clean_up;
145   }
146 
147   if(argc < 2)
148   {
149     fprintf(stderr,"Not enough input arguments.\n\n");
150     goto clean_up;
151   }
152 
153 
154   for (i = 1; i < argc-2; i++)
155   {
156     recognized = 0;
157     if (strcmp(argv[i], "-h") == 0)
158     {
159       print_usage();
160       goto clean_up;
161     }
162 
163     if (strcmp(argv[i], "-e") == 0)
164     {
165       print_error = 1;
166       recognized = 1;
167       continue;
168     }
169 
170     if (strcmp(argv[i], "-v") == 0)
171     {
172       if(i+1 >= argc-2)
173       {
174         fprintf(stderr,"You have to specify a value after argument -v\n");
175         goto clean_up;
176       }
177       verb = atoi(argv[i+1]);
178       if(verb < 0 || verb > 1)
179       {
180         fprintf(stderr,"A value after the argument -v must be either 0 or 1.\n");
181         goto clean_up;
182       }
183 
184       i++;
185       recognized = 1;
186       continue;
187     }
188 
189     if (strcmp(argv[i], "-t") == 0)
190     {
191       if(i+1 >= argc-2)
192       {
193         fprintf(stderr,"You have to specify a value after argument -t\n");
194         goto clean_up;
195       }
196       output_type = atoi(argv[i+1]);
197       if(output_type != 0 && output_type != 1)
198       {
199         fprintf(stderr,"A value after the argument -t must be either 0 or 1.\n");
200         goto clean_up;
201       }
202 
203       i++;
204       recognized = 1;
205       continue;
206     }
207 
208     if (strcmp(argv[i], "-o") == 0)
209     {
210       if(i+1 >= argc-2)
211       {
212         fprintf(stderr,"You have to specify a string after argument -o\n");
213         goto clean_up;
214       }
215 
216       len = strlen(argv[i+1]);
217       output_fname = calloc(len+1,sizeof(char));
218       strcpy(output_fname, argv[i+1]);
219 
220       i++;
221       recognized = 1;
222       continue;
223     }
224 
225     if(recognized == 0)
226     {
227       fprintf(stderr,"Unknown input argument: %s\n", argv[i]);
228       goto clean_up;
229     }
230 
231   }
232 
233   len = strlen(argv[argc-2]);
234   input_fname = calloc(len+1,sizeof(char));
235   strcpy(input_fname, argv[argc-2]);
236 
237   len = strlen(argv[argc-1]);
238   model_fname = calloc(len+1,sizeof(char));
239   strcpy(model_fname, argv[argc-1]);
240 
241   if(verb)
242   {
243     printf("Verbosity: %d\n", verb);
244     printf("Output type: %d\n", output_type);
245     printf("Print error: %d\n", print_error);
246     printf("Example file: %s\n", input_fname);
247     printf("Model file: %s\n", model_fname);
248     if( output_fname != NULL)
249       printf("Output file: %s\n", output_fname);
250     else
251       printf("Output file: stdout\n");
252   }
253 
254 
255   /*----------------------------------------------------------------
256     Load classification rule which is either
257      vector [nDim x 1] + bias [1x1]
258     or
259      matrix [nDim x nY]
260   -------------------------------------------------------------------*/
261 
262   /* load W from model file */
263   fid = fopen(model_fname, "r");
264   if(fid == NULL) {
265     fprintf(stderr,"Cannot open model file.\n");
266     perror("fopen error ");
267     goto clean_up;
268   }
269 
270   if(verb)
271   {
272     printf("Analysing model file... ");
273     fflush(stdout);
274   }
275 
276   /* read the first line */
277   if(fgets(line,LIBSLF_MAXLINELEN, fid) == NULL )
278   {
279     fprintf(stderr,"Empty example file.\n");
280     fclose(fid);
281     goto clean_up;
282   }
283   else
284   {
285     nLines = 1;
286     begptr = line;
287     while(1)
288     {
289       val = strtod(begptr, &endptr);
290 
291       if(val == 0 && begptr == endptr)
292         break;
293 
294       nCols++;
295       begptr = endptr;
296     }
297   }
298 
299   go = 1;
300   while(go)
301   {
302     begptr = line;
303 
304     tmp_nCols = 0;
305     while(1)
306     {
307       val = strtod(begptr, &endptr);
308 
309       if(val == 0 && begptr == endptr)
310         break;
311 
312       tmp_nCols++;
313       begptr = endptr;
314     }
315     if( tmp_nCols != nCols)
316     {
317       fprintf(stderr,"Error: Model file contains lines with different number of colums.\n");
318       fclose(fid);
319       goto clean_up;
320     }
321 
322     if(fgets(line,LIBSLF_MAXLINELEN, fid) == NULL )
323     {
324       go = 0;
325     }
326     else
327       nLines++;
328   }
329 
330   if(verb)
331     printf("done.\n"
332            "Number of lines: %d\n"
333            "Number of columns: %d\n",
334            nLines,nCols);
335 
336   if(nCols == 1)
337   {
338     nY = 2;
339     nDim = nLines-1;
340     binary_problem = 1;
341 
342     /* learned weight vector */
343     W = (double*)calloc(nDim,sizeof(double));
344     if(W == NULL)
345     {
346       fprintf(stderr,"Not enough memory for vector W.\n");
347       fclose(fid);
348       goto clean_up;
349     }
350 
351     if(verb)
352     {
353       printf("Model file contains binary classification rule.\n");
354       printf("Reading model file...");
355     }
356 
357     fseek(fid,0,SEEK_SET);
358     for(i=0; i <= nDim; i++)
359     {
360       if(fgets(line,LIBSLF_MAXLINELEN, fid) == NULL )
361       {
362         fprintf(stderr,"Model file corrupted.\n");
363 	fclose(fid);
364         goto clean_up;
365       }
366 
367       begptr = line;
368       val = strtod(begptr, &endptr);
369 
370       if(val == 0 && begptr == endptr)
371       {
372         fprintf(stderr,"Model file corrupted.\n");
373 	fclose(fid);
374         goto clean_up;
375       }
376 
377       if(i < nDim)
378         W[i] = val;
379       else
380         W0 = val;
381     }
382 
383     if(verb)
384       printf("done.\n");
385 
386   }
387   else
388   {
389     nY = nCols;
390     nDim = nLines;
391     binary_problem = 0;
392 
393     /* learned weight vector */
394     W = (double*)calloc(nDim*nY,sizeof(double));
395     if(W == NULL)
396     {
397       fprintf(stderr,"Not enough memory for matrix W.\n");
398       fclose(fid);
399       goto clean_up;
400     }
401 
402     if(verb)
403     {
404       printf("Model file contains multi-class classification rule.\n");
405       printf("Reading model file...");
406     }
407 
408     fseek(fid,0,SEEK_SET);
409     for(i=0; i < nDim; i++)
410     {
411       if(fgets(line,LIBSLF_MAXLINELEN, fid) == NULL )
412       {
413         fprintf(stderr,"Model file corrupted.\n");
414 	fclose(fid);
415         goto clean_up;
416       }
417 
418       begptr = line;
419       for(j=0; j < nY; j++)
420       {
421         val = strtod(begptr, &endptr);
422 
423         if(val == 0 && begptr == endptr)
424         {
425           fprintf(stderr,"Model file corrupted.\n");
426 	  fclose(fid);
427           goto clean_up;
428         }
429         begptr = endptr;
430 
431 
432         W[LIBOCAS_INDEX(i,j,nDim)] = val;
433       }
434     }
435     if(verb)
436       printf("done.\n");
437   }
438 
439   fclose(fid);
440 
441 /*  printf("W0=%f, W = [ ", W0);*/
442 /*  for(i=0; i < nDim; i++)*/
443 /*    printf("%f ", W[i]);*/
444 /*  printf("]\n");*/
445 /*  printf("W = [\n");*/
446 /*  for(j=0; j < nDim; j++)*/
447 /*  {*/
448 /*    for(i=0; i < nY; i++)*/
449 /*      printf("%f ", W[LIBOCAS_INDEX(j,i,nDim)]);*/
450 /*    printf("\n");*/
451 /*  }*/
452 
453 /*  load_time = get_time() - load_time;*/
454 
455   /*-----------------------------------------------------
456     Read examples and classify them.
457     -----------------------------------------------------*/
458 
459   fid = fopen(input_fname, "r");
460   if(fid == NULL) {
461     fprintf(stderr,"Cannot open input file.\n");
462     perror("fopen error ");
463     goto clean_up;
464   }
465 
466   if(output_fname == NULL)
467     fout = stdout;
468   else
469   {
470     fout = fopen(output_fname, "w+");
471     if(fout == NULL) {
472       fprintf(stderr,"Cannot open output file.\n");
473       perror("fopen error ");
474       fclose(fid);
475       goto clean_up;
476     }
477   }
478 
479   if(verb)
480   {
481     if(output_fname != NULL)
482       printf("Classifying...");
483     else
484       printf("Outputs:\n");
485   }
486 
487   nClassErrors = (long*)calloc(nY,sizeof(long));
488   if(nClassErrors == NULL)
489   {
490     fprintf(stderr,"Not enough memory for vector nClassError.\n");
491     fclose(fid);
492     if(output_fname != NULL)
493       fclose(fout);
494     goto clean_up;
495   }
496 
497   nClass = (long*)calloc(nY,sizeof(long));
498   if(nClass == NULL)
499   {
500     fprintf(stderr,"Not enough memory for vector nClass.\n");
501     fclose(fid);
502     if(output_fname != NULL)
503       fclose(fout);
504     goto clean_up;
505   }
506 
507 
508   go = 1;
509   while(go) {
510 
511     if(fgets(line,LIBSLF_MAXLINELEN, fid) == NULL )
512     {
513       go = 0;
514     }
515     else
516     {
517       line_cnt ++;
518       nnzf = svmlight_format_parse_line(line, &label, feat_idx, feat_val);
519 
520       if(nnzf == -1)
521       {
522          fprintf(stderr,"Parsing error on line %ld .\n", line_cnt);
523          fprintf(stderr,"Probably defective input file.\n");
524 	 fclose(fid);
525 	 if(output_fname != NULL)
526 	   fclose(fout);
527          goto clean_up;
528       }
529 
530       max_dim = LIBOCAS_MAX(max_dim,feat_idx[nnzf-1]);
531 
532       if(binary_problem == 1)
533       {
534         dfce = W0;
535         for(i=0; i < nnzf; i++)
536         {
537           if(feat_idx[i]-1 < nDim)
538             dfce += feat_val[i]*W[feat_idx[i]-1];
539         }
540 
541         if(label == +1)
542           nClass[0]++;
543         else
544           nClass[1]++;
545 
546         if(dfce >=0 && label == -1)
547         {
548           nClassErrors[1]++;
549           nErrors++;
550         }
551         else if (dfce < 0 && label== +1)
552         {
553           nClassErrors[0]++;
554           nErrors++;
555         }
556 
557         if(output_type == 0)
558         {
559           if(dfce >=0 )
560             fprintf(fout,"+1\n");
561           else
562             fprintf(fout,"-1\n");
563 
564         }
565         else
566           fprintf(fout,"%.20f\n", dfce);
567       }
568       else
569       {
570         max_dfce = -LIBOCAS_PLUS_INF;
571         for(j=0; j < nY; j++)
572         {
573           dfce = 0;
574           for(i=0; i < nnzf; i++)
575           {
576             if(feat_idx[i]-1 < nDim)
577               dfce += feat_val[i]*W[LIBOCAS_INDEX(feat_idx[i]-1,j,nDim)];
578           }
579           if(output_type==1)
580             fprintf(fout,"%.20f ", dfce);
581 
582           if(max_dfce < dfce)
583           {
584             max_dfce = dfce;
585             pred_label = j+1;
586           }
587         }
588         if(output_type==0)
589             fprintf(fout,"%d", pred_label);
590 
591         fprintf(fout,"\n");
592 
593         nClass[label-1]++;
594 
595         if(label != pred_label)
596         {
597           nErrors++;
598           nClassErrors[label-1]++;
599         }
600 
601       }
602     }
603   }
604 
605   if(verb)
606   {
607     if(output_fname != NULL)
608       printf("done.\n");
609 
610     printf("Number of examples: %ld\n"
611            "Maximal dimensionality: %ld\n", line_cnt, max_dim);
612   }
613   if(print_error)
614   {
615     printf("Classification error: %f%%(%ld/%ld)\n", 100.0*(double)nErrors/(double)line_cnt,nErrors,line_cnt);
616     printf("Per-class errors: ");
617     if(binary_problem)
618     {
619       printf("+1: %f%%(%ld/%ld) -1: %f%%(%ld/%ld)\n",
620              100.0*(double)nClassErrors[0]/(double)nClass[0], nClassErrors[0],nClass[0],
621              100.0*(double)nClassErrors[1]/(double)nClass[1], nClassErrors[1],nClass[1]);
622     }
623     else
624     {
625       for(i=0; i < nY; i++)
626         printf("%d: %f%%(%ld/%ld) ", i+1, 100.0*(double)nClassErrors[i]/(double)nClass[i],
627                nClassErrors[i],nClass[i]);
628       printf("\n");
629     }
630   }
631 
632 
633   fclose(fid);
634   if(output_fname != NULL)
635     fclose(fout);
636 
637   exitflag = 0;
638 
639 clean_up:
640 
641   free(input_fname);
642   free(model_fname);
643   free(output_fname);
644   free(W);
645   free(line);
646   free(feat_val);
647   free(feat_idx);
648   free(nClassErrors);
649   free(nClass);
650 
651   return(exitflag);
652 }
653 
654 
655 
656