1 /************************************************************************/
2 /* */
3 /* svm_common.c */
4 /* */
5 /* Definitions and functions used in both svm_learn and svm_classify. */
6 /* */
7 /* Author: Thorsten Joachims */
8 /* Date: 02.07.02 */
9 /* */
10 /* Copyright (c) 2002 Thorsten Joachims - All rights reserved */
11 /* */
12 /* This software is available for non-commercial use only. It must */
13 /* not be modified and distributed without prior permission of the */
14 /* author. The author is not responsible for implications from the */
15 /* use of this software. */
16 /* */
17 /************************************************************************/
18
19 # include "ctype.h"
20 # include "svm_common.h"
21 # include "kernel.h" /* this contains a user supplied kernel */
22
23 long kernel_cache_statistic;
24
classify_example(MODEL * model,DOC * ex)25 double classify_example(MODEL *model, DOC *ex)
26 /* classifies one example */
27 {
28 register long i;
29 register double dist;
30
31 dist=0;
32 for(i=1;i<model->sv_num;i++) {
33 dist+=kernel(&model->kernel_parm,model->supvec[i],ex)*model->alpha[i];
34 }
35 return(dist-model->b);
36 }
37
classify_example_linear(MODEL * model,DOC * ex)38 double classify_example_linear(MODEL *model, DOC *ex)
39 /* classifies example for linear kernel */
40
41 /* important: the model must have the linear weight vector computed */
42
43 /* important: the feature numbers in the example to classify must */
44 /* not be larger than the weight vector! */
45 {
46 return((double)(sprod_ns(model->lin_weights,ex->words)-model->b));
47 }
48
kernel(KERNEL_PARM * kernel_parm,DOC * a,DOC * b)49 CFLOAT kernel(KERNEL_PARM *kernel_parm, DOC *a, DOC *b)
50 /* calculate the kernel function */
51 {
52 kernel_cache_statistic++;
53 switch(kernel_parm->kernel_type) {
54 case 0: /* linear */
55 return((CFLOAT)sprod_ss(a->words,b->words));
56 case 1: /* polynomial */
57 return((CFLOAT)pow(kernel_parm->coef_lin*sprod_ss(a->words,b->words)+kernel_parm->coef_const,(double)kernel_parm->poly_degree));
58 case 2: /* radial basis function */
59 return((CFLOAT)exp(-kernel_parm->rbf_gamma*(a->twonorm_sq-2*sprod_ss(a->words,b->words)+b->twonorm_sq)));
60 case 3: /* sigmoid neural net */
61 return((CFLOAT)tanh(kernel_parm->coef_lin*sprod_ss(a->words,b->words)+kernel_parm->coef_const));
62 case 4: /* custom-kernel supplied in file kernel.h*/
63 return((CFLOAT)custom_kernel(kernel_parm,a,b));
64 default: printf("Error: Unknown kernel function\n"); exit(1);
65 }
66 }
67
sprod_ss(WORD * a,WORD * b)68 double sprod_ss(WORD *a, WORD *b)
69 /* compute the inner product of two sparse vectors */
70 {
71 register FVAL sum=0;
72 register WORD *ai,*bj;
73
74 ai=a;
75 bj=b;
76 while (ai->wnum && bj->wnum) {
77 if(ai->wnum > bj->wnum) {
78 bj++;
79 }
80 else if (ai->wnum < bj->wnum) {
81 ai++;
82 }
83 else {
84 sum+=ai->weight * bj->weight;
85 ai++;
86 bj++;
87 }
88 }
89
90 return((double)sum);
91 }
92
sub_ss(WORD * a,WORD * b)93 WORD* sub_ss(WORD *a, WORD *b)
94 /* compute the difference a-b of two sparse vectors */
95 {
96 register WORD *sum,*sumi;
97 register WORD *ai,*bj;
98 long veclength;
99
100 ai=a;
101 bj=b;
102 veclength=0;
103 while (ai->wnum && bj->wnum) {
104 if(ai->wnum > bj->wnum) {
105 veclength++;
106 bj++;
107 }
108 else if (ai->wnum < bj->wnum) {
109 veclength++;
110 ai++;
111 }
112 else {
113 veclength++;
114 ai++;
115 bj++;
116 }
117 }
118 while (bj->wnum) {
119 veclength++;
120 bj++;
121 }
122 while (ai->wnum) {
123 veclength++;
124 ai++;
125 }
126 veclength++;
127
128 sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
129 sumi=sum;
130 ai=a;
131 bj=b;
132 while (ai->wnum && bj->wnum) {
133 if(ai->wnum > bj->wnum) {
134 (*sumi)=(*bj);
135 sumi->weight*=(-1);
136 sumi++;
137 bj++;
138 }
139 else if (ai->wnum < bj->wnum) {
140 (*sumi)=(*ai);
141 sumi++;
142 ai++;
143 }
144 else {
145 (*sumi)=(*ai);
146 sumi->weight-=bj->weight;
147 sumi++;
148 ai++;
149 bj++;
150 }
151 }
152 while (bj->wnum) {
153 (*sumi)=(*bj);
154 sumi->weight*=(-1);
155 sumi++;
156 bj++;
157 }
158 while (ai->wnum) {
159 (*sumi)=(*ai);
160 sumi++;
161 ai++;
162 }
163 sumi->wnum=0;
164 return(sum);
165 }
166
model_length_s(MODEL * model,KERNEL_PARM * kernel_parm)167 double model_length_s(MODEL *model, KERNEL_PARM *kernel_parm)
168 /* compute length of weight vector */
169 {
170 register long i,j;
171 register double sum=0,alphai;
172 register DOC *supveci;
173
174 for(i=1;i<model->sv_num;i++) {
175 alphai=model->alpha[i];
176 supveci=model->supvec[i];
177 for(j=1;j<model->sv_num;j++) {
178 sum+=alphai*model->alpha[j]
179 *kernel(kernel_parm,supveci,model->supvec[j]);
180 }
181 }
182 return(sqrt(sum));
183 }
184
clear_vector_n(double * vec,long int n)185 void clear_vector_n(double *vec, long int n)
186 {
187 register long i;
188 for(i=0;i<=n;i++) vec[i]=0;
189 }
190
add_vector_ns(double * vec_n,WORD * vec_s,double faktor)191 void add_vector_ns(double *vec_n, WORD *vec_s, double faktor)
192 {
193 register WORD *ai;
194 ai=vec_s;
195 while (ai->wnum) {
196 vec_n[ai->wnum]+=(faktor*ai->weight);
197 ai++;
198 }
199 }
200
sprod_ns(double * vec_n,WORD * vec_s)201 double sprod_ns(double *vec_n, WORD *vec_s)
202 {
203 register double sum=0;
204 register WORD *ai;
205 ai=vec_s;
206 while (ai->wnum) {
207 sum+=(vec_n[ai->wnum]*ai->weight);
208 ai++;
209 }
210 return(sum);
211 }
212
add_weight_vector_to_linear_model(MODEL * model)213 void add_weight_vector_to_linear_model(MODEL *model)
214 /* compute weight vector in linear case and add to model */
215 {
216 long i;
217
218 model->lin_weights=(double *)my_malloc(sizeof(double)*(model->totwords+1));
219 clear_vector_n(model->lin_weights,model->totwords);
220 for(i=1;i<model->sv_num;i++) {
221 add_vector_ns(model->lin_weights,(model->supvec[i])->words,
222 model->alpha[i]);
223 }
224 }
225
read_model(char * modelfile,MODEL * model,long int max_words,long int ll)226 void read_model(char *modelfile, MODEL *model, long int max_words, long int ll)
227 {
228 FILE *modelfl;
229 long j,i;
230 char *line;
231 WORD *words;
232 register long wpos;
233 long wnum,pos;
234 double weight;
235 char version_buffer[100];
236 int numread;
237
238 if(verbosity>=1) {
239 printf("Reading model..."); fflush(stdout);
240 }
241 words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10));
242 line = (char *)my_malloc(sizeof(char)*ll);
243
244 if ((modelfl = fopen (modelfile, "r")) == NULL)
245 { perror (modelfile); exit (1); }
246
247 fscanf(modelfl,"SVM-light Version %s\n",version_buffer);
248 if(strcmp(version_buffer,VERSION_SVMLIGHT)) {
249 perror ("Version of model-file does not match version of svm_classify!");
250 exit (1);
251 }
252 fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type);
253 fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree);
254 fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma);
255 fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin);
256 fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const);
257 fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom);
258
259 fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords);
260 fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc);
261 fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num);
262 fscanf(modelfl,"%lf%*[^\n]\n", &model->b);
263
264 for(i=1;i<model->sv_num;i++) {
265 fgets(line,(int)ll,modelfl);
266 pos=0;
267 wpos=0;
268 sscanf(line,"%lf",&model->alpha[i]);
269 while(!isspace((int)line[++pos]));
270 while(((numread=sscanf(line+pos,"%ld:%lf",&wnum,&weight)) != EOF)
271 && (wpos<max_words)) {
272 if(numread != 2) {
273 perror("Parsing error while reading model!");
274 printf("LINE: %s\n",line);
275 }
276 while(!isspace((int)line[++pos]));
277 words[wpos].wnum=wnum;
278 words[wpos].weight=(FVAL)weight;
279 wpos++;
280 }
281 model->supvec[i] = (DOC *)my_malloc(sizeof(DOC));
282 (model->supvec[i])->words = (WORD *)my_malloc(sizeof(WORD)*(wpos+1));
283 for(j=0;j<wpos;j++) {
284 (model->supvec[i])->words[j]=words[j];
285 }
286 ((model->supvec[i])->words[wpos]).wnum=0;
287 (model->supvec[i])->twonorm_sq = sprod_ss((model->supvec[i])->words,
288 (model->supvec[i])->words);
289 (model->supvec[i])->docnum = -1;
290 }
291 fclose(modelfl);
292 free(line);
293 free(words);
294 if(verbosity>=1) {
295 fprintf(stdout, "OK. (%d support vectors read)\n",(int)(model->sv_num-1));
296 }
297 }
298
read_documents(char * docfile,DOC * docs,double * label,long int max_words_doc,long int ll,long int * totwords,long int * totdoc)299 void read_documents(char *docfile, DOC *docs, double *label,
300 long int max_words_doc, long int ll,
301 long int *totwords, long int *totdoc)
302 {
303 char *line;
304 DOC doc;
305 long dnum=0,wpos,i,dpos=0,dneg=0,dunlab=0;
306 double doc_label;
307 FILE *docfl;
308
309 line = (char *)my_malloc(sizeof(char)*ll);
310
311 if ((docfl = fopen (docfile, "r")) == NULL)
312 { perror (docfile); exit (1); }
313
314 doc.words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10));
315 if(verbosity>=1) {
316 printf("Reading examples into memory..."); fflush(stdout);
317 }
318 dnum=0;
319 (*totwords)=0;
320 while((!feof(docfl)) && fgets(line,(int)ll,docfl)) {
321 if(line[0] == '#') continue; /* line contains comments */
322 if(!parse_document(line,&doc,&doc_label,&wpos,max_words_doc)) {
323 printf("\nParsing error in line %ld!\n%s",dnum,line);
324 exit(1);
325 }
326 label[dnum]=doc_label;
327 /* printf("Class=%ld ",doc_label); */
328 if(doc_label > 0) dpos++;
329 if (doc_label < 0) dneg++;
330 if (doc_label == 0) dunlab++;
331 if((wpos>1) && ((doc.words[wpos-2]).wnum>(*totwords)))
332 (*totwords)=(doc.words[wpos-2]).wnum;
333 docs[dnum].queryid = doc.queryid;
334 docs[dnum].costfactor = doc.costfactor;
335 docs[dnum].words = (WORD *)my_malloc(sizeof(WORD)*(wpos));
336 docs[dnum].docnum=dnum;
337 for(i=0;i<wpos;i++) {
338 docs[dnum].words[i]=doc.words[i];
339 /* printf("%ld::%f ",(docs[dnum].words[i]).wnum,(docs[dnum].words[i]).weight); */
340
341 }
342 docs[dnum].twonorm_sq=doc.twonorm_sq;
343 /* printf("\nNorm=%f\n",docs[dnum].twonorm_sq); */
344 dnum++;
345 if(verbosity>=1) {
346 if((dnum % 100) == 0) {
347 printf("%ld..",dnum); fflush(stdout);
348 }
349 }
350 }
351
352 fclose(docfl);
353 free(line);
354 free(doc.words);
355 if(verbosity>=1) {
356 fprintf(stdout, "OK. (%ld examples read)\n", dnum);
357 }
358 (*totdoc)=dnum;
359 }
360
parse_document(char * line,DOC * doc,double * label,long int * numwords,long int max_words_doc)361 int parse_document(char *line, DOC *doc, double *label,
362 long int *numwords, long int max_words_doc)
363 {
364 register long wpos,pos;
365 long wnum;
366 double weight;
367 int numread;
368 char featurepair[1000],junk[1000];
369
370 doc->queryid=0;
371 doc->costfactor=1;
372
373 pos=0;
374 while(line[pos]) { /* cut off comments */
375 if(line[pos] == '#') {
376 line[pos]=0;
377 }
378 else {
379 pos++;
380 }
381 }
382 wpos=0;
383 if(sscanf(line,"%lf",label) == EOF) return(0);
384 pos=0;
385 while(isspace((int)line[pos])) pos++;
386 while((!isspace((int)line[pos])) && line[pos]) pos++;
387 while(((numread=sscanf(line+pos,"%s",featurepair)) != EOF) &&
388 (wpos<max_words_doc)) {
389 /* printf("%s\n",featurepair); */
390 while(isspace((int)line[pos])) pos++;
391 while((!isspace((int)line[pos])) && line[pos]) pos++;
392 if(sscanf(featurepair,"qid:%ld%s",&wnum,junk)==1) {
393 /* it is the query id */
394 doc->queryid=(long)wnum;
395 }
396 else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1) {
397 /* it is the example-dependent cost factor */
398 doc->costfactor=(double)weight;
399 }
400 else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2) {
401 /* it is a regular feature */
402 if(wnum<=0) {
403 perror ("Feature numbers must be larger or equal to 1!!!\n");
404 printf("LINE: %s\n",line);
405 exit (1);
406 }
407 if((wpos>0) && ((doc->words[wpos-1]).wnum >= wnum)) {
408 perror ("Features must be in increasing order!!!\n");
409 printf("LINE: %s\n",line);
410 exit (1);
411 }
412 (doc->words[wpos]).wnum=wnum;
413 (doc->words[wpos]).weight=(FVAL)weight;
414 wpos++;
415 }
416 else {
417 perror ("Cannot parse feature/value pair!!!\n");
418 printf("'%s' in LINE: %s\n",featurepair,line);
419 exit (1);
420 }
421 }
422 (doc->words[wpos]).wnum=0;
423 (*numwords)=wpos+1;
424 doc->docnum=-1;
425 doc->twonorm_sq=sprod_ss(doc->words,doc->words);
426 return(1);
427 }
428
nol_ll(char * file,long int * nol,long int * wol,long int * ll)429 void nol_ll(char *file, long int *nol, long int *wol, long int *ll)
430 /* Grep through file and count number of lines, maximum number of
431 spaces per line, and longest line. */
432 {
433 FILE *fl;
434 int ic;
435 char c;
436 long current_length,current_wol;
437
438 if ((fl = fopen (file, "r")) == NULL)
439 { perror (file); exit (1); }
440 current_length=0;
441 current_wol=0;
442 (*ll)=0;
443 (*nol)=1;
444 (*wol)=0;
445 while((ic=getc(fl)) != EOF) {
446 c=(char)ic;
447 current_length++;
448 if(isspace((int)c)) {
449 current_wol++;
450 }
451 if(c == '\n') {
452 (*nol)++;
453 if(current_length>(*ll)) {
454 (*ll)=current_length;
455 }
456 if(current_wol>(*wol)) {
457 (*wol)=current_wol;
458 }
459 current_length=0;
460 current_wol=0;
461 }
462 }
463 fclose(fl);
464 }
465
minl(long int a,long int b)466 long minl(long int a, long int b)
467 {
468 if(a<b)
469 return(a);
470 else
471 return(b);
472 }
473
maxl(long int a,long int b)474 long maxl(long int a, long int b)
475 {
476 if(a>b)
477 return(a);
478 else
479 return(b);
480 }
481
get_runtime(void)482 long get_runtime(void)
483 {
484 clock_t start;
485 start = clock();
486 return((long)((double)start*100.0/(double)CLOCKS_PER_SEC));
487 }
488
489
490 # ifdef MICROSOFT
491
isnan(double a)492 int isnan(double a)
493 {
494 return(_isnan(a));
495 }
496
497 # endif
498
499
my_malloc(size_t size)500 void *my_malloc(size_t size)
501 {
502 void *ptr;
503 ptr=(void *)malloc(size);
504 if(!ptr) {
505 perror ("Out of memory!\n");
506 exit (1);
507 }
508 return(ptr);
509 }
510
copyright_notice(void)511 void copyright_notice(void)
512 {
513 printf("\nCopyright: Thorsten Joachims, thorsten@ls8.cs.uni-dortmund.de\n\n");
514 printf("This software is available for non-commercial use only. It must not\n");
515 printf("be modified and distributed without prior permission of the author.\n");
516 printf("The author is not responsible for implications from the use of this\n");
517 printf("software.\n\n");
518 }
519