1#!/usr/bin/env python
2
3from __future__ import print_function
4import sys
5
6try:
7	import scipy
8	from scipy import sparse
9except:
10	scipy = None
11	sparse = None
12
13
14__all__ = ['svm_read_problem', 'evaluations', 'csr_find_scale_param', 'csr_scale']
15
def svm_read_problem(data_file_name, return_scipy=False):
	"""
	svm_read_problem(data_file_name, return_scipy=False) -> [y, x], y: list, x: list of dictionary
	svm_read_problem(data_file_name, return_scipy=True)  -> [y, x], y: ndarray, x: csr_matrix

	Read LIBSVM-format data from data_file_name and return labels y
	and data instances x.
	"""
	prob_y = []
	prob_x = []
	row_ptr = [0]
	col_idx = []
	use_scipy = scipy is not None and return_scipy
	# Use a context manager so the file handle is always closed, even if a
	# malformed line raises mid-parse (the original leaked the handle).
	with open(data_file_name) as data_file:
		for line in data_file:
			line = line.split(None, 1)
			# In case an instance with all zero features
			if len(line) == 1: line += ['']
			label, features = line
			prob_y += [float(label)]
			if use_scipy:
				nz = 0
				for e in features.split():
					ind, val = e.split(":")
					val = float(val)
					# Skip explicit zeros so the CSR structure stays sparse.
					if val != 0:
						col_idx += [int(ind)-1]
						prob_x += [val]
						nz += 1
				row_ptr += [row_ptr[-1]+nz]
			else:
				xi = {}
				for e in features.split():
					ind, val = e.split(":")
					xi[int(ind)] = float(val)
				prob_x += [xi]
	if use_scipy:
		# scipy.array (a numpy re-export) was removed in SciPy 1.9; use numpy
		# directly — it is always importable when scipy is.
		import numpy as np
		prob_y = np.array(prob_y)
		prob_x = sparse.csr_matrix((np.array(prob_x), np.array(col_idx), np.array(row_ptr)))
	return (prob_y, prob_x)
57
def evaluations_scipy(ty, pv):
	"""
	evaluations_scipy(ty, pv) -> (ACC, MSE, SCC)
	ty, pv: ndarray

	Calculate accuracy, mean squared error and squared correlation coefficient
	using the true values (ty) and predicted values (pv).
	"""
	# scipy.ndarray / scipy.errstate were numpy re-exports removed in
	# SciPy 1.9; use numpy directly (always importable when scipy is).
	import numpy as np
	if not (scipy is not None and isinstance(ty, np.ndarray) and isinstance(pv, np.ndarray)):
		raise TypeError("type of ty and pv must be ndarray")
	if len(ty) != len(pv):
		raise ValueError("len(ty) must be equal to len(pv)")
	ACC = 100.0*(ty == pv).mean()
	MSE = ((ty - pv)**2).mean()
	l = len(ty)
	sumv = pv.sum()
	sumy = ty.sum()
	sumvy = (pv*ty).sum()
	sumvv = (pv*pv).sum()
	sumyy = (ty*ty).sum()
	with np.errstate(all = 'raise'):
		try:
			SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
		# Zero variance in ty or pv makes the denominator 0; SCC is
		# undefined then. Catch only the division errors, not everything.
		except (FloatingPointError, ZeroDivisionError):
			SCC = float('nan')
	return (float(ACC), float(MSE), float(SCC))
84
def evaluations(ty, pv, useScipy = True):
	"""
	evaluations(ty, pv, useScipy) -> (ACC, MSE, SCC)
	ty, pv: list, tuple or ndarray
	useScipy: convert ty, pv to ndarray, and use scipy functions for the evaluation

	Calculate accuracy, mean squared error and squared correlation coefficient
	using the true values (ty) and predicted values (pv).
	"""
	if scipy is not None and useScipy:
		# scipy.asarray was removed in SciPy 1.9; convert via numpy and
		# delegate to the vectorized implementation.
		import numpy as np
		return evaluations_scipy(np.asarray(ty), np.asarray(pv))
	if len(ty) != len(pv):
		raise ValueError("len(ty) must be equal to len(pv)")
	total_correct = total_error = 0
	sumv = sumy = sumvv = sumyy = sumvy = 0
	for v, y in zip(pv, ty):
		if y == v:
			total_correct += 1
		total_error += (v-y)*(v-y)
		sumv += v
		sumy += y
		sumvv += v*v
		sumyy += y*y
		sumvy += v*y
	l = len(ty)
	ACC = 100.0*total_correct/l
	MSE = total_error/l
	try:
		SCC = ((l*sumvy-sumv*sumy)*(l*sumvy-sumv*sumy))/((l*sumvv-sumv*sumv)*(l*sumyy-sumy*sumy))
	# Zero variance in ty or pv makes the denominator 0; SCC is undefined
	# then. Catch only the division error, not everything.
	except ZeroDivisionError:
		SCC = float('nan')
	return (float(ACC), float(MSE), float(SCC))
117
def csr_find_scale_param(x, lower=-1, upper=1):
	"""
	csr_find_scale_param(x, lower=-1, upper=1) -> {'coef': ndarray, 'offset': ndarray}

	Compute per-column linear scaling parameters that map each feature of
	the csr_matrix x into the range [lower, upper]. Constant columns get
	coef = offset = 0 (left untouched by csr_scale).
	"""
	assert isinstance(x, sparse.csr_matrix)
	assert lower < upper
	num_rows = x.shape[0]
	col_min = x.min(axis=0).toarray().flatten()
	col_max = x.max(axis=0).toarray().flatten()
	# coef = (upper - lower) / (max - min), with 0 for constant columns.
	coef = (col_max - col_min) / (upper - lower)
	varying = coef != 0
	coef[varying] = 1.0 / coef[varying]

	# (x - ones(l,1) * feat_min') * diag(coef) + lower
	# = x * diag(coef) - ones(l, 1) * (feat_min' * diag(coef)) + lower
	# = x * diag(coef) + ones(l, 1) * (-feat_min' * diag(coef) + lower)
	# = x * diag(coef) + ones(l, 1) * offset'
	offset = lower - col_min * coef
	offset[~varying] = 0

	# A dense offset column applied to every row can blow up the nnz count;
	# warn when the scaled matrix would be noticeably denser.
	if sum(offset != 0) * num_rows > 3 * x.getnnz():
		print(
			"WARNING: The #nonzeros of the scaled data is at least 2 times larger than the original one.\n"
			"If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.",
			file=sys.stderr)

	return {'coef':coef, 'offset':offset}
141
def csr_scale(x, scale_param):
	"""
	csr_scale(x, scale_param) -> csr_matrix

	Apply the per-column linear scaling described by scale_param (a dict
	with 'coef' and 'offset' ndarrays, as produced by csr_find_scale_param)
	to the csr_matrix x and return the scaled matrix.
	"""
	assert isinstance(x, sparse.csr_matrix)

	offset = scale_param['offset']
	coef = scale_param['coef']
	assert len(coef) == len(offset)

	l, n = x.shape

	if not n == len(coef):
		print("WARNING: The dimension of scaling parameters and feature number do not match.", file=sys.stderr)
		# BUG FIX: `resize` was an undefined name here and raised NameError
		# whenever this branch was taken; use numpy.resize explicitly
		# (truncates or cyclically repeats the parameters to length n).
		import numpy as np
		coef = np.resize(coef, n)
		offset = np.resize(offset, n)

	# scaled_x = x * diag(coef) + ones(l, 1) * offset'
	offset = sparse.csr_matrix(offset.reshape(1, n))
	offset = sparse.vstack([offset] * l, format='csr', dtype=x.dtype)
	scaled_x = x.dot(sparse.diags(coef, 0, shape=(n, n))) + offset

	if scaled_x.getnnz() > x.getnnz():
		print(
			"WARNING: original #nonzeros %d\n" % x.getnnz() +
			"       > new      #nonzeros %d\n" % scaled_x.getnnz() +
			"If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.",
			file=sys.stderr)

	# BUG FIX: the original computed scaled_x and then discarded it; return
	# the result so callers can actually use the scaled data.
	return scaled_x
167