1 /************************************************************************/
2 /*                                                                      */
3 /*   svm_common.c                                                       */
4 /*                                                                      */
5 /*   Definitions and functions used in both svm_learn and svm_classify. */
6 /*                                                                      */
7 /*   Author: Thorsten Joachims                                          */
8 /*   Date: 02.07.02                                                     */
9 /*                                                                      */
10 /*   Copyright (c) 2002  Thorsten Joachims - All rights reserved        */
11 /*                                                                      */
12 /*   This software is available for non-commercial use only. It must    */
13 /*   not be modified and distributed without prior permission of the    */
14 /*   author. The author is not responsible for implications from the    */
15 /*   use of this software.                                              */
16 /*                                                                      */
17 /************************************************************************/
18 
19 # include "ctype.h"
20 # include "svm_common.h"
21 # include "kernel.h"           /* this contains a user supplied kernel */
22 
23 long   kernel_cache_statistic;
24 
classify_example(MODEL * model,DOC * ex)25 double classify_example(MODEL *model, DOC *ex)
26      /* classifies one example */
27 {
28   register long i;
29   register double dist;
30 
31   dist=0;
32   for(i=1;i<model->sv_num;i++) {
33     dist+=kernel(&model->kernel_parm,model->supvec[i],ex)*model->alpha[i];
34   }
35   return(dist-model->b);
36 }
37 
classify_example_linear(MODEL * model,DOC * ex)38 double classify_example_linear(MODEL *model, DOC *ex)
39      /* classifies example for linear kernel */
40 
41      /* important: the model must have the linear weight vector computed */
42 
43      /* important: the feature numbers in the example to classify must */
44      /*            not be larger than the weight vector!               */
45 {
46   return((double)(sprod_ns(model->lin_weights,ex->words)-model->b));
47 }
48 
kernel(KERNEL_PARM * kernel_parm,DOC * a,DOC * b)49 CFLOAT kernel(KERNEL_PARM *kernel_parm, DOC *a, DOC *b)
50      /* calculate the kernel function */
51 {
52   kernel_cache_statistic++;
53   switch(kernel_parm->kernel_type) {
54     case 0: /* linear */
55             return((CFLOAT)sprod_ss(a->words,b->words));
56     case 1: /* polynomial */
57             return((CFLOAT)pow(kernel_parm->coef_lin*sprod_ss(a->words,b->words)+kernel_parm->coef_const,(double)kernel_parm->poly_degree));
58     case 2: /* radial basis function */
59             return((CFLOAT)exp(-kernel_parm->rbf_gamma*(a->twonorm_sq-2*sprod_ss(a->words,b->words)+b->twonorm_sq)));
60     case 3: /* sigmoid neural net */
61             return((CFLOAT)tanh(kernel_parm->coef_lin*sprod_ss(a->words,b->words)+kernel_parm->coef_const));
62     case 4: /* custom-kernel supplied in file kernel.h*/
63             return((CFLOAT)custom_kernel(kernel_parm,a,b));
64     default: printf("Error: Unknown kernel function\n"); exit(1);
65   }
66 }
67 
sprod_ss(WORD * a,WORD * b)68 double sprod_ss(WORD *a, WORD *b)
69      /* compute the inner product of two sparse vectors */
70 {
71     register FVAL sum=0;
72     register WORD *ai,*bj;
73 
74     ai=a;
75     bj=b;
76     while (ai->wnum && bj->wnum) {
77       if(ai->wnum > bj->wnum) {
78         bj++;
79       }
80       else if (ai->wnum < bj->wnum) {
81         ai++;
82       }
83       else {
84         sum+=ai->weight * bj->weight;
85         ai++;
86         bj++;
87       }
88     }
89 
90     return((double)sum);
91 }
92 
sub_ss(WORD * a,WORD * b)93 WORD* sub_ss(WORD *a, WORD *b)
94      /* compute the difference a-b of two sparse vectors */
95 {
96     register WORD *sum,*sumi;
97     register WORD *ai,*bj;
98     long veclength;
99 
100     ai=a;
101     bj=b;
102     veclength=0;
103     while (ai->wnum && bj->wnum) {
104       if(ai->wnum > bj->wnum) {
105 	veclength++;
106 	bj++;
107       }
108       else if (ai->wnum < bj->wnum) {
109 	veclength++;
110 	ai++;
111       }
112       else {
113 	veclength++;
114 	ai++;
115 	bj++;
116       }
117     }
118     while (bj->wnum) {
119       veclength++;
120       bj++;
121     }
122     while (ai->wnum) {
123       veclength++;
124       ai++;
125     }
126     veclength++;
127 
128     sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
129     sumi=sum;
130     ai=a;
131     bj=b;
132     while (ai->wnum && bj->wnum) {
133       if(ai->wnum > bj->wnum) {
134 	(*sumi)=(*bj);
135 	sumi->weight*=(-1);
136 	sumi++;
137 	bj++;
138       }
139       else if (ai->wnum < bj->wnum) {
140 	(*sumi)=(*ai);
141 	sumi++;
142 	ai++;
143       }
144       else {
145 	(*sumi)=(*ai);
146 	sumi->weight-=bj->weight;
147 	sumi++;
148 	ai++;
149 	bj++;
150       }
151     }
152     while (bj->wnum) {
153       (*sumi)=(*bj);
154       sumi->weight*=(-1);
155       sumi++;
156       bj++;
157     }
158     while (ai->wnum) {
159       (*sumi)=(*ai);
160       sumi++;
161       ai++;
162     }
163     sumi->wnum=0;
164     return(sum);
165 }
166 
model_length_s(MODEL * model,KERNEL_PARM * kernel_parm)167 double model_length_s(MODEL *model, KERNEL_PARM *kernel_parm)
168      /* compute length of weight vector */
169 {
170   register long i,j;
171   register double sum=0,alphai;
172   register DOC *supveci;
173 
174   for(i=1;i<model->sv_num;i++) {
175     alphai=model->alpha[i];
176     supveci=model->supvec[i];
177     for(j=1;j<model->sv_num;j++) {
178       sum+=alphai*model->alpha[j]
179 	   *kernel(kernel_parm,supveci,model->supvec[j]);
180     }
181   }
182   return(sqrt(sum));
183 }
184 
clear_vector_n(double * vec,long int n)185 void clear_vector_n(double *vec, long int n)
186 {
187   register long i;
188   for(i=0;i<=n;i++) vec[i]=0;
189 }
190 
add_vector_ns(double * vec_n,WORD * vec_s,double faktor)191 void add_vector_ns(double *vec_n, WORD *vec_s, double faktor)
192 {
193   register WORD *ai;
194   ai=vec_s;
195   while (ai->wnum) {
196     vec_n[ai->wnum]+=(faktor*ai->weight);
197     ai++;
198   }
199 }
200 
sprod_ns(double * vec_n,WORD * vec_s)201 double sprod_ns(double *vec_n, WORD *vec_s)
202 {
203   register double sum=0;
204   register WORD *ai;
205   ai=vec_s;
206   while (ai->wnum) {
207     sum+=(vec_n[ai->wnum]*ai->weight);
208     ai++;
209   }
210   return(sum);
211 }
212 
add_weight_vector_to_linear_model(MODEL * model)213 void add_weight_vector_to_linear_model(MODEL *model)
214      /* compute weight vector in linear case and add to model */
215 {
216   long i;
217 
218   model->lin_weights=(double *)my_malloc(sizeof(double)*(model->totwords+1));
219   clear_vector_n(model->lin_weights,model->totwords);
220   for(i=1;i<model->sv_num;i++) {
221     add_vector_ns(model->lin_weights,(model->supvec[i])->words,
222 		  model->alpha[i]);
223   }
224 }
225 
read_model(char * modelfile,MODEL * model,long int max_words,long int ll)226 void read_model(char *modelfile, MODEL *model, long int max_words, long int ll)
227 {
228   FILE *modelfl;
229   long j,i;
230   char *line;
231   WORD *words;
232   register long wpos;
233   long wnum,pos;
234   double weight;
235   char version_buffer[100];
236   int numread;
237 
238   if(verbosity>=1) {
239     printf("Reading model..."); fflush(stdout);
240   }
241   words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10));
242   line = (char *)my_malloc(sizeof(char)*ll);
243 
244   if ((modelfl = fopen (modelfile, "r")) == NULL)
245   { perror (modelfile); exit (1); }
246 
247   fscanf(modelfl,"SVM-light Version %s\n",version_buffer);
248   if(strcmp(version_buffer,VERSION_SVMLIGHT)) {
249     perror ("Version of model-file does not match version of svm_classify!");
250     exit (1);
251   }
252   fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type);
253   fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree);
254   fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma);
255   fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin);
256   fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const);
257   fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom);
258 
259   fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords);
260   fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc);
261   fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num);
262   fscanf(modelfl,"%lf%*[^\n]\n", &model->b);
263 
264   for(i=1;i<model->sv_num;i++) {
265     fgets(line,(int)ll,modelfl);
266     pos=0;
267     wpos=0;
268     sscanf(line,"%lf",&model->alpha[i]);
269     while(!isspace((int)line[++pos]));
270     while(((numread=sscanf(line+pos,"%ld:%lf",&wnum,&weight)) != EOF)
271 	  && (wpos<max_words)) {
272       if(numread != 2) {
273 	perror("Parsing error while reading model!");
274 	printf("LINE: %s\n",line);
275       }
276       while(!isspace((int)line[++pos]));
277       words[wpos].wnum=wnum;
278       words[wpos].weight=(FVAL)weight;
279       wpos++;
280     }
281     model->supvec[i] = (DOC *)my_malloc(sizeof(DOC));
282     (model->supvec[i])->words = (WORD *)my_malloc(sizeof(WORD)*(wpos+1));
283     for(j=0;j<wpos;j++) {
284       (model->supvec[i])->words[j]=words[j];
285     }
286     ((model->supvec[i])->words[wpos]).wnum=0;
287     (model->supvec[i])->twonorm_sq = sprod_ss((model->supvec[i])->words,
288 					      (model->supvec[i])->words);
289     (model->supvec[i])->docnum = -1;
290   }
291   fclose(modelfl);
292   free(line);
293   free(words);
294   if(verbosity>=1) {
295     fprintf(stdout, "OK. (%d support vectors read)\n",(int)(model->sv_num-1));
296   }
297 }
298 
read_documents(char * docfile,DOC * docs,double * label,long int max_words_doc,long int ll,long int * totwords,long int * totdoc)299 void read_documents(char *docfile, DOC *docs, double *label,
300 		    long int max_words_doc, long int ll,
301 		    long int *totwords, long int *totdoc)
302 {
303   char *line;
304   DOC doc;
305   long dnum=0,wpos,i,dpos=0,dneg=0,dunlab=0;
306   double doc_label;
307   FILE *docfl;
308 
309   line = (char *)my_malloc(sizeof(char)*ll);
310 
311   if ((docfl = fopen (docfile, "r")) == NULL)
312   { perror (docfile); exit (1); }
313 
314   doc.words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10));
315   if(verbosity>=1) {
316     printf("Reading examples into memory..."); fflush(stdout);
317   }
318   dnum=0;
319   (*totwords)=0;
320   while((!feof(docfl)) && fgets(line,(int)ll,docfl)) {
321     if(line[0] == '#') continue;  /* line contains comments */
322     if(!parse_document(line,&doc,&doc_label,&wpos,max_words_doc)) {
323       printf("\nParsing error in line %ld!\n%s",dnum,line);
324       exit(1);
325     }
326     label[dnum]=doc_label;
327     /*  printf("Class=%ld ",doc_label);  */
328     if(doc_label > 0) dpos++;
329     if (doc_label < 0) dneg++;
330     if (doc_label == 0) dunlab++;
331     if((wpos>1) && ((doc.words[wpos-2]).wnum>(*totwords)))
332       (*totwords)=(doc.words[wpos-2]).wnum;
333     docs[dnum].queryid = doc.queryid;
334     docs[dnum].costfactor = doc.costfactor;
335     docs[dnum].words = (WORD *)my_malloc(sizeof(WORD)*(wpos));
336     docs[dnum].docnum=dnum;
337     for(i=0;i<wpos;i++) {
338       docs[dnum].words[i]=doc.words[i];
339       /*  printf("%ld::%f ",(docs[dnum].words[i]).wnum,(docs[dnum].words[i]).weight);  */
340 
341     }
342     docs[dnum].twonorm_sq=doc.twonorm_sq;
343     /* printf("\nNorm=%f\n",docs[dnum].twonorm_sq);  */
344     dnum++;
345     if(verbosity>=1) {
346       if((dnum % 100) == 0) {
347 	printf("%ld..",dnum); fflush(stdout);
348       }
349     }
350   }
351 
352   fclose(docfl);
353   free(line);
354   free(doc.words);
355   if(verbosity>=1) {
356     fprintf(stdout, "OK. (%ld examples read)\n", dnum);
357   }
358   (*totdoc)=dnum;
359 }
360 
parse_document(char * line,DOC * doc,double * label,long int * numwords,long int max_words_doc)361 int parse_document(char *line, DOC *doc, double *label,
362 		   long int *numwords, long int max_words_doc)
363 {
364   register long wpos,pos;
365   long wnum;
366   double weight;
367   int numread;
368   char featurepair[1000],junk[1000];
369 
370   doc->queryid=0;
371   doc->costfactor=1;
372 
373   pos=0;
374   while(line[pos]) {      /* cut off comments */
375     if(line[pos] == '#') {
376       line[pos]=0;
377     }
378     else {
379       pos++;
380     }
381   }
382   wpos=0;
383   if(sscanf(line,"%lf",label) == EOF) return(0);
384   pos=0;
385   while(isspace((int)line[pos])) pos++;
386   while((!isspace((int)line[pos])) && line[pos]) pos++;
387   while(((numread=sscanf(line+pos,"%s",featurepair)) != EOF) &&
388 	(wpos<max_words_doc)) {
389     /* printf("%s\n",featurepair); */
390     while(isspace((int)line[pos])) pos++;
391     while((!isspace((int)line[pos])) && line[pos]) pos++;
392     if(sscanf(featurepair,"qid:%ld%s",&wnum,junk)==1) {
393       /* it is the query id */
394       doc->queryid=(long)wnum;
395     }
396     else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1) {
397       /* it is the example-dependent cost factor */
398       doc->costfactor=(double)weight;
399     }
400     else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2) {
401       /* it is a regular feature */
402       if(wnum<=0) {
403 	perror ("Feature numbers must be larger or equal to 1!!!\n");
404 	printf("LINE: %s\n",line);
405 	exit (1);
406       }
407       if((wpos>0) && ((doc->words[wpos-1]).wnum >= wnum)) {
408 	perror ("Features must be in increasing order!!!\n");
409 	printf("LINE: %s\n",line);
410 	exit (1);
411       }
412       (doc->words[wpos]).wnum=wnum;
413       (doc->words[wpos]).weight=(FVAL)weight;
414       wpos++;
415     }
416     else {
417       perror ("Cannot parse feature/value pair!!!\n");
418       printf("'%s' in LINE: %s\n",featurepair,line);
419       exit (1);
420     }
421   }
422   (doc->words[wpos]).wnum=0;
423   (*numwords)=wpos+1;
424   doc->docnum=-1;
425   doc->twonorm_sq=sprod_ss(doc->words,doc->words);
426   return(1);
427 }
428 
nol_ll(char * file,long int * nol,long int * wol,long int * ll)429 void nol_ll(char *file, long int *nol, long int *wol, long int *ll)
430      /* Grep through file and count number of lines, maximum number of
431         spaces per line, and longest line. */
432 {
433   FILE *fl;
434   int ic;
435   char c;
436   long current_length,current_wol;
437 
438   if ((fl = fopen (file, "r")) == NULL)
439   { perror (file); exit (1); }
440   current_length=0;
441   current_wol=0;
442   (*ll)=0;
443   (*nol)=1;
444   (*wol)=0;
445   while((ic=getc(fl)) != EOF) {
446     c=(char)ic;
447     current_length++;
448     if(isspace((int)c)) {
449       current_wol++;
450     }
451     if(c == '\n') {
452       (*nol)++;
453       if(current_length>(*ll)) {
454 	(*ll)=current_length;
455       }
456       if(current_wol>(*wol)) {
457 	(*wol)=current_wol;
458       }
459       current_length=0;
460       current_wol=0;
461     }
462   }
463   fclose(fl);
464 }
465 
minl(long int a,long int b)466 long minl(long int a, long int b)
467 {
468   if(a<b)
469     return(a);
470   else
471     return(b);
472 }
473 
maxl(long int a,long int b)474 long maxl(long int a, long int b)
475 {
476   if(a>b)
477     return(a);
478   else
479     return(b);
480 }
481 
get_runtime(void)482 long get_runtime(void)
483 {
484   clock_t start;
485   start = clock();
486   return((long)((double)start*100.0/(double)CLOCKS_PER_SEC));
487 }
488 
489 
490 # ifdef MICROSOFT
491 
isnan(double a)492 int isnan(double a)
493 {
494   return(_isnan(a));
495 }
496 
497 # endif
498 
499 
my_malloc(size_t size)500 void *my_malloc(size_t size)
501 {
502   void *ptr;
503   ptr=(void *)malloc(size);
504   if(!ptr) {
505     perror ("Out of memory!\n");
506     exit (1);
507   }
508   return(ptr);
509 }
510 
copyright_notice(void)511 void copyright_notice(void)
512 {
513   printf("\nCopyright: Thorsten Joachims, thorsten@ls8.cs.uni-dortmund.de\n\n");
514   printf("This software is available for non-commercial use only. It must not\n");
515   printf("be modified and distributed without prior permission of the author.\n");
516   printf("The author is not responsible for implications from the use of this\n");
517   printf("software.\n\n");
518 }
519