#!/usr/bin/env python

import os, sys
sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path
from .liblinear import *
from .liblinear import __all__ as liblinear_all
from .liblinear import scipy, sparse
from .commonutil import *
from .commonutil import __all__ as common_all
from ctypes import c_double

if sys.version_info[0] < 3:
    range = xrange
    from itertools import izip as zip
    _cstr = lambda s: s.encode("utf-8") if isinstance(s, unicode) else str(s)
else:
    _cstr = lambda s: bytes(s, "utf-8")

__all__ = ['load_model', 'save_model', 'train', 'predict'] + liblinear_all + common_all


def load_model(model_file_name):
    """
    load_model(model_file_name) -> model

    Load a LIBLINEAR model from model_file_name and return.
    """
    model = liblinear.load_model(_cstr(model_file_name))
    if not model:
        print("can't open model file %s" % model_file_name)
        return None
    model = toPyModel(model)
    return model

def save_model(model_file_name, model):
    """
    save_model(model_file_name, model) -> None

    Save a LIBLINEAR model to the file model_file_name.
    """
    liblinear.save_model(_cstr(model_file_name), model)

def train(arg1, arg2=None, arg3=None):
    """
    train(y, x [, options]) -> model | ACC

    y: a list/tuple/ndarray of l true labels (type must be int/double).

    x: 1. a list/tuple of l training instances. Feature vector of
          each training instance is a list/tuple or dictionary.

       2. an l * n numpy ndarray or scipy spmatrix (n: number of features).

    train(prob [, options]) -> model | ACC
    train(prob, param) -> model | ACC

    Train a model from data (y, x) or a problem prob using
    'options' or a parameter param.

    If '-v' is specified in 'options' (i.e., cross validation),
    either accuracy (ACC) or mean-squared error (MSE) is returned.

    options:
        -s type : set type of solver (default 1)
          for multi-class classification
             0 -- L2-regularized logistic regression (primal)
             1 -- L2-regularized L2-loss support vector classification (dual)
             2 -- L2-regularized L2-loss support vector classification (primal)
             3 -- L2-regularized L1-loss support vector classification (dual)
             4 -- support vector classification by Crammer and Singer
             5 -- L1-regularized L2-loss support vector classification
             6 -- L1-regularized logistic regression
             7 -- L2-regularized logistic regression (dual)
          for regression
            11 -- L2-regularized L2-loss support vector regression (primal)
            12 -- L2-regularized L2-loss support vector regression (dual)
            13 -- L2-regularized L1-loss support vector regression (dual)
          for outlier detection
            21 -- one-class support vector machine (dual)
        -c cost : set the parameter C (default 1)
        -p epsilon : set the epsilon in loss function of SVR (default 0.1)
        -e epsilon : set tolerance of termination criterion
            -s 0 and 2
                |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
                where f is the primal function (default 0.01)
            -s 11
                |f'(w)|_2 <= eps*|f'(w0)|_2 (default 0.0001)
            -s 1, 3, 4, 7, and 21
                Dual maximal violation <= eps; similar to libsvm (default 0.1 except 0.01 for -s 21)
            -s 5 and 6
                |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
                where f is the primal function (default 0.01)
            -s 12 and 13
                |f'(alpha)|_1 <= eps |f'(alpha0)|,
                where f is the dual function (default 0.1)
        -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
        -R : do not regularize the bias; must be used with -B 1 to have the bias; DON'T use this unless you know what it is
            (for -s 0, 2, 5, 6, 11)
        -wi weight: weights adjust the parameter C of different classes (see README for details)
        -v n: n-fold cross validation mode
        -C : find parameters (C for -s 0, 2 and C, p for -s 11)
        -q : quiet mode (no outputs)
    """
    prob, param = None, None
    if isinstance(arg1, (list, tuple)) or (scipy and isinstance(arg1, scipy.ndarray)):
        assert isinstance(arg2, (list, tuple)) or (scipy and isinstance(arg2, (scipy.ndarray, sparse.spmatrix)))
        y, x, options = arg1, arg2, arg3
        prob = problem(y, x)
        param = parameter(options)
    elif isinstance(arg1, problem):
        prob = arg1
        if isinstance(arg2, parameter):
            param = arg2
        else:
            param = parameter(arg2)
    if prob is None or param is None:
        raise TypeError("Wrong types for the arguments")

    prob.set_bias(param.bias)
    liblinear.set_print_string_function(param.print_func)
    err_msg = liblinear.check_parameter(prob, param)
    if err_msg:
        raise ValueError('Error: %s' % err_msg)

    if param.flag_find_parameters:
        nr_fold = param.nr_fold
        best_C = c_double()
        best_p = c_double()
        best_score = c_double()
        if param.flag_C_specified:
            start_C = param.C
        else:
            start_C = -1.0
        if param.flag_p_specified:
            start_p = param.p
        else:
            start_p = -1.0
        liblinear.find_parameters(prob, param, nr_fold, start_C, start_p, best_C, best_p, best_score)
        if param.solver_type in [L2R_LR, L2R_L2LOSS_SVC]:
            print("Best C = %g CV accuracy = %g%%\n" % (best_C.value, 100.0*best_score.value))
        elif param.solver_type in [L2R_L2LOSS_SVR]:
            print("Best C = %g Best p = %g CV MSE = %g\n" % (best_C.value, best_p.value, best_score.value))
        return best_C.value, best_p.value, best_score.value

    elif param.flag_cross_validation:
        l, nr_fold = prob.l, param.nr_fold
        target = (c_double * l)()
        liblinear.cross_validation(prob, param, nr_fold, target)
        ACC, MSE, SCC = evaluations(prob.y[:l], target[:l])
        if param.solver_type in [L2R_L2LOSS_SVR, L2R_L2LOSS_SVR_DUAL, L2R_L1LOSS_SVR_DUAL]:
            print("Cross Validation Mean squared error = %g" % MSE)
            print("Cross Validation Squared correlation coefficient = %g" % SCC)
            return MSE
        else:
            print("Cross Validation Accuracy = %g%%" % ACC)
            return ACC
    else:
        m = liblinear.train(prob, param)
        m = toPyModel(m)

        return m

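# A minimal usage sketch for train() (illustrative only; the tiny data set and
# option strings below are made-up examples, not shipped with the library):
#
#     y = [1, -1, 1, -1]
#     x = [{1: 1.0, 3: 1.0}, {1: -1.0, 3: -1.0}, {2: 1.0}, {2: -1.0}]
#     m = train(y, x, '-s 2 -c 4')    # L2-regularized L2-loss SVC (primal)
#     acc = train(y, x, '-v 2')       # 2-fold cross validation; returns ACC
#                                     # (MSE for the regression solvers)
#     prob = problem(y, x)
#     param = parameter('-s 0 -c 1')
#     m = train(prob, param)          # equivalent problem/parameter form
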
def predict(y, x, m, options=""):
    """
    predict(y, x, m [, options]) -> (p_labels, p_acc, p_vals)

    y: a list/tuple/ndarray of l true labels (type must be int/double).
       It is used for calculating the accuracy. Use [] if true labels are
       unavailable.

    x: 1. a list/tuple of l testing instances. Feature vector of
          each testing instance is a list/tuple or dictionary.

       2. an l * n numpy ndarray or scipy spmatrix (n: number of features).

    Predict data (y, x) with the model m.
    options:
        -b probability_estimates: whether to output probability estimates, 0 or 1 (default 0); currently for logistic regression only
        -q : quiet mode (no outputs)

    The return tuple contains
    p_labels: a list of predicted labels
    p_acc: a tuple including accuracy (for classification), mean-squared
           error, and squared correlation coefficient (for regression).
    p_vals: a list of decision values or probability estimates (if '-b 1'
            is specified). If k is the number of classes, for decision values,
            each element includes results of predicting k binary-class
            SVMs. If k = 2 and the solver is not MCSVM_CS, only one decision value
            is returned. For probabilities, each element contains k values
            indicating the probability that the testing instance is in each class.
            Note that the order of classes here is the same as the 'model.label'
            field in the model structure.
    """

    def info(s):
        print(s)

    if scipy and isinstance(x, scipy.ndarray):
        x = scipy.ascontiguousarray(x)  # enforce row-major
    elif sparse and isinstance(x, sparse.spmatrix):
        x = x.tocsr()
    elif not isinstance(x, (list, tuple)):
        raise TypeError("type of x: {0} is not supported!".format(type(x)))

    if (not isinstance(y, (list, tuple))) and (not (scipy and isinstance(y, scipy.ndarray))):
        raise TypeError("type of y: {0} is not supported!".format(type(y)))

    predict_probability = 0
    argv = options.split()
    i = 0
    while i < len(argv):
        if argv[i] == '-b':
            i += 1
            predict_probability = int(argv[i])
        elif argv[i] == '-q':
            info = print_null
        else:
            raise ValueError("Wrong options")
        i += 1

    solver_type = m.param.solver_type
    nr_class = m.get_nr_class()
    nr_feature = m.get_nr_feature()
    is_prob_model = m.is_probability_model()
    bias = m.bias
    if bias >= 0:
        biasterm = feature_node(nr_feature+1, bias)
    else:
        biasterm = feature_node(-1, bias)
    pred_labels = []
    pred_values = []

    if scipy and isinstance(x, sparse.spmatrix):
        nr_instance = x.shape[0]
    else:
        nr_instance = len(x)

    if predict_probability:
        if not is_prob_model:
            raise TypeError('probability output is only supported for logistic regression')
        prob_estimates = (c_double * nr_class)()
        for i in range(nr_instance):
            if scipy and isinstance(x, sparse.spmatrix):
                indslice = slice(x.indptr[i], x.indptr[i+1])
                xi, idx = gen_feature_nodearray((x.indices[indslice], x.data[indslice]), feature_max=nr_feature)
            else:
                xi, idx = gen_feature_nodearray(x[i], feature_max=nr_feature)
            xi[-2] = biasterm
            label = liblinear.predict_probability(m, xi, prob_estimates)
            values = prob_estimates[:nr_class]
            pred_labels += [label]
            pred_values += [values]
    else:
        if nr_class <= 2:
            nr_classifier = 1
        else:
            nr_classifier = nr_class
        dec_values = (c_double * nr_classifier)()
        for i in range(nr_instance):
            if scipy and isinstance(x, sparse.spmatrix):
                indslice = slice(x.indptr[i], x.indptr[i+1])
                xi, idx = gen_feature_nodearray((x.indices[indslice], x.data[indslice]), feature_max=nr_feature)
            else:
                xi, idx = gen_feature_nodearray(x[i], feature_max=nr_feature)
            xi[-2] = biasterm
            label = liblinear.predict_values(m, xi, dec_values)
            values = dec_values[:nr_classifier]
            pred_labels += [label]
            pred_values += [values]

    if len(y) == 0:
        y = [0] * nr_instance
    ACC, MSE, SCC = evaluations(y, pred_labels)

    if m.is_regression_model():
        info("Mean squared error = %g (regression)" % MSE)
        info("Squared correlation coefficient = %g (regression)" % SCC)
    else:
        info("Accuracy = %g%% (%d/%d) (classification)" % (ACC, int(round(nr_instance*ACC/100)), nr_instance))

    return pred_labels, (ACC, MSE, SCC), pred_values
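
# A minimal end-to-end sketch for predict(), save_model() and load_model()
# (illustrative only; the data, model file name, and options below are made-up
# examples, not shipped with the library):
#
#     y, x = [1, -1], [{1: 1.0, 2: 1.0}, {1: -1.0, 2: -1.0}]
#     m = train(y, x, '-s 0')                       # logistic regression, so '-b 1' is allowed below
#     p_labels, p_acc, p_vals = predict(y, x, m)    # p_acc = (ACC, MSE, SCC)
#     p_labels, p_acc, p_probs = predict(y, x, m, '-b 1')
#     save_model('illustrative.model', m)
#     m2 = load_model('illustrative.model')
#     p_labels2, _, _ = predict([], x, m2)          # pass [] when true labels are unknown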