#!/usr/bin/env python
"""Helpers for LIBSVM-format data: reading, evaluation and sparse scaling."""

from __future__ import print_function
import sys

try:
    import numpy as np
    import scipy
    from scipy import sparse
except Exception:
    # NumPy/SciPy are optional.  A missing *or* broken installation must not
    # prevent the pure-Python code paths below from being used.
    np = None
    scipy = None
    sparse = None


__all__ = ['svm_read_problem', 'evaluations', 'csr_find_scale_param', 'csr_scale']


def svm_read_problem(data_file_name, return_scipy=False):
    """
    svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
    svm_read_problem(data_file_name, return_scipy=True)  -> [y, x], y: ndarray, x: csr_matrix

    Read LIBSVM-format data from data_file_name and return labels y
    and data instances x.

    Each non-blank line is "<label> <index>:<value> ...".  Blank lines are
    skipped.  When return_scipy is true but SciPy is unavailable, the
    list/dictionary form is returned instead.

    In the dictionary form the feature indices are kept exactly as written in
    the file and zero values are stored.  In the csr_matrix form the column
    is the file index minus one and zero values are dropped.
    """
    use_scipy = scipy is not None and return_scipy
    prob_y = []
    prob_x = []
    row_ptr = [0]
    col_idx = []
    # The context manager guarantees the file is closed even if parsing fails.
    with open(data_file_name) as data_file:
        for line in data_file:
            parts = line.split(None, 1)
            if not parts:
                # Blank line: nothing to parse.
                continue
            # In case an instance with all zero features
            if len(parts) == 1:
                parts.append('')
            label, features = parts
            prob_y.append(float(label))
            if use_scipy:
                nz = 0
                for e in features.split():
                    ind, val = e.split(":")
                    val = float(val)
                    if val != 0:
                        col_idx.append(int(ind) - 1)
                        prob_x.append(val)
                        nz += 1
                row_ptr.append(row_ptr[-1] + nz)
            else:
                xi = {}
                for e in features.split():
                    ind, val = e.split(":")
                    xi[int(ind)] = float(val)
                prob_x.append(xi)
    if use_scipy:
        # Use NumPy directly: the scipy.array alias is not available in
        # current SciPy releases.
        prob_y = np.array(prob_y)
        prob_x = np.array(prob_x)
        col_idx = np.array(col_idx)
        row_ptr = np.array(row_ptr)
        prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr))
    return (prob_y, prob_x)


def evaluations_scipy(ty, pv):
    """
    evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
    ty, pv: ndarray

    Calculate accuracy, mean squared error and squared correlation coefficient
    using the true values (ty) and predicted values (pv).

    Raises TypeError if SciPy is unavailable or ty/pv are not ndarrays, and
    ValueError if their lengths differ.  SCC is NaN when it cannot be
    computed (for example a zero denominator).
    """
    if not (scipy is not None and isinstance(ty, np.ndarray) and isinstance(pv, np.ndarray)):
        raise TypeError("type of ty and pv must be ndarray")
    if len(ty) != len(pv):
        raise ValueError("len(ty) must be equal to len(pv)")
    ACC = 100.0*(ty == pv).mean()
    MSE = ((ty - pv)**2).mean()
    l = len(ty)
    sumv = pv.sum()
    sumy = ty.sum()
    sumvy = (pv*ty).sum()
    sumvv = (pv*pv).sum()
    sumyy = (ty*ty).sum()
    # Turn NumPy floating-point warnings into exceptions so that a degenerate
    # SCC becomes NaN instead of a silently propagated inf/garbage value.
    with np.errstate(all='raise'):
        try:
            SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
        except ArithmeticError:
            SCC = float('nan')
    return (float(ACC), float(MSE), float(SCC))


def evaluations(ty, pv, useScipy = True):
    """
    evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
    ty, pv: list, tuple or ndarray
    useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation

    Calculate accuracy, mean squared error and squared correlation coefficient
    using the true values (ty) and predicted values (pv).

    Falls back to a pure-Python loop when SciPy is unavailable or useScipy is
    false.  Raises ValueError if len(ty) != len(pv).  SCC is NaN when its
    denominator is zero.
    """
    if scipy is not None and useScipy:
        return evaluations_scipy(np.asarray(ty), np.asarray(pv))
    if len(ty) != len(pv):
        raise ValueError("len(ty) must be equal to len(pv)")
    total_correct = total_error = 0
    sumv = sumy = sumvv = sumyy = sumvy = 0
    for v, y in zip(pv, ty):
        if y == v:
            total_correct += 1
        total_error += (v-y)*(v-y)
        sumv += v
        sumy += y
        sumvv += v*v
        sumyy += y*y
        sumvy += v*y
    l = len(ty)
    ACC = 100.0*total_correct/l
    MSE = total_error/l
    try:
        SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
    except ArithmeticError:
        # Zero denominator (e.g. constant predictions or constant labels).
        SCC = float('nan')
    return (float(ACC), float(MSE), float(SCC))


def csr_find_scale_param(x, lower=-1, upper=1):
    """
    csr_find_scale_param(x, lower=-1, upper=1) -> {'coef': ndarray, 'offset': ndarray}
    x: csr_matrix

    Compute per-column scaling parameters from the column minima and maxima
    of x, such that  x * diag(coef) + ones(l, 1) * offset'  is the scaled
    data.  Columns whose minimum equals their maximum get coef 0 and
    offset 0.

    Prints a warning to stderr when the non-zero offsets would make the
    scaled data much denser than x.
    """
    assert isinstance(x, sparse.csr_matrix)
    assert lower < upper
    l = x.shape[0]
    feat_min = x.min(axis=0).toarray().flatten()
    feat_max = x.max(axis=0).toarray().flatten()
    coef = (feat_max - feat_min) / (upper - lower)
    # Invert only the non-zero entries; constant columns keep coef == 0.
    nonconstant = coef != 0
    coef[nonconstant] = 1.0 / coef[nonconstant]

    # (x - ones(l,1) * feat_min') * diag(coef) + lower
    # = x * diag(coef) - ones(l, 1) * (feat_min' * diag(coef)) + lower
    # = x * diag(coef) + ones(l, 1) * (-feat_min' * diag(coef) + lower)
    # = x * diag(coef) + ones(l, 1) * offset'
    offset = -feat_min * coef + lower
    offset[coef == 0] = 0

    # Every column with a non-zero offset becomes fully dense after scaling.
    if np.count_nonzero(offset) * l > 3 * x.getnnz():
        print(
            "WARNING: The #nonzeros of the scaled data is at least 2 times larger than the original one.\n"
            "If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.",
            file=sys.stderr)

    return {'coef':coef, 'offset':offset}


def csr_scale(x, scale_param):
    """
    csr_scale(x, scale_param)
    x: csr_matrix
    scale_param: dictionary with 'coef' and 'offset' arrays of equal length

    Compute  scaled_x = x * diag(coef) + ones(l, 1) * offset'.

    If the parameter length differs from the number of columns of x, a
    warning is printed to stderr and both arrays are resized to match.  A
    second warning is printed when scaling increases the number of stored
    non-zeros.
    """
    assert isinstance(x, sparse.csr_matrix)

    offset = scale_param['offset']
    coef = scale_param['coef']
    assert len(coef) == len(offset)

    l, n = x.shape

    if not n == len(coef):
        print("WARNING: The dimension of scaling parameters and feature number do not match.", file=sys.stderr)
        # np.resize: the bare name `resize` was never imported and raised
        # NameError on this path.
        coef = np.resize(coef, n)
        offset = np.resize(offset, n)

    # scaled_x = x * diag(coef) + ones(l, 1) * offset'
    offset = sparse.csr_matrix(offset.reshape(1, n))
    offset = sparse.vstack([offset] * l, format='csr', dtype=x.dtype)
    scaled_x = x.dot(sparse.diags(coef, 0, shape=(n, n))) + offset

    if scaled_x.getnnz() > x.getnnz():
        print(
            "WARNING: original #nonzeros %d\n" % x.getnnz() +
            " > new #nonzeros %d\n" % scaled_x.getnnz() +
            "If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.",
            file=sys.stderr)

    # NOTE(review): the visible source ends here. Confirm whether the
    # definition continues past this point (presumably by returning scaled_x).