1# 2# Licensed to the Apache Software Foundation (ASF) under one or more 3# contributor license agreements. See the NOTICE file distributed with 4# this work for additional information regarding copyright ownership. 5# The ASF licenses this file to You under the Apache License, Version 2.0 6# (the "License"); you may not use this file except in compliance with 7# the License. You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, software 12# distributed under the License is distributed on an "AS IS" BASIS, 13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14# See the License for the specific language governing permissions and 15# limitations under the License. 16# 17 18""" 19MLlib utilities for linear algebra. For dense vectors, MLlib 20uses the NumPy C{array} type, so you can simply pass NumPy arrays 21around. For sparse vectors, users can construct a L{SparseVector} 22object from MLlib or pass SciPy C{scipy.sparse} column vectors if 23SciPy is available in their environment. 24""" 25 26import sys 27import array 28import struct 29 30if sys.version >= '3': 31 basestring = str 32 xrange = range 33 import copyreg as copy_reg 34 long = int 35else: 36 from itertools import izip as zip 37 import copy_reg 38 39import numpy as np 40 41from pyspark import since 42from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \ 43 IntegerType, ByteType, BooleanType 44 45 46__all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors', 47 'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices'] 48 49 50if sys.version_info[:2] == (2, 7): 51 # speed up pickling array in Python 2.7 52 def fast_pickle_array(ar): 53 return array.array, (ar.typecode, ar.tostring()) 54 copy_reg.pickle(array.array, fast_pickle_array) 55 56 57# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, 58# such as _dot and _serialize_double_vector, start to support scipy.sparse matrices. 59 60try: 61 import scipy.sparse 62 _have_scipy = True 63except: 64 # No SciPy in environment, but that's okay 65 _have_scipy = False 66 67 68def _convert_to_vector(l): 69 if isinstance(l, Vector): 70 return l 71 elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange): 72 return DenseVector(l) 73 elif _have_scipy and scipy.sparse.issparse(l): 74 assert l.shape[1] == 1, "Expected column vector" 75 # Make sure the converted csc_matrix has sorted indices. 76 csc = l.tocsc() 77 if not csc.has_sorted_indices: 78 csc.sort_indices() 79 return SparseVector(l.shape[0], csc.indices, csc.data) 80 else: 81 raise TypeError("Cannot convert type %s into Vector" % type(l)) 82 83 84def _vector_size(v): 85 """ 86 Returns the size of the vector. 87 88 >>> _vector_size([1., 2., 3.]) 89 3 90 >>> _vector_size((1., 2., 3.)) 91 3 92 >>> _vector_size(array.array('d', [1., 2., 3.])) 93 3 94 >>> _vector_size(np.zeros(3)) 95 3 96 >>> _vector_size(np.zeros((3, 1))) 97 3 98 >>> _vector_size(np.zeros((1, 3))) 99 Traceback (most recent call last): 100 ... 101 ValueError: Cannot treat an ndarray of shape (1, 3) as a vector 102 """ 103 if isinstance(v, Vector): 104 return len(v) 105 elif type(v) in (array.array, list, tuple, xrange): 106 return len(v) 107 elif type(v) == np.ndarray: 108 if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1): 109 return len(v) 110 else: 111 raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape)) 112 elif _have_scipy and scipy.sparse.issparse(v): 113 assert v.shape[1] == 1, "Expected column vector" 114 return v.shape[0] 115 else: 116 raise TypeError("Cannot treat type %s as a vector" % type(v)) 117 118 119def _format_float(f, digits=4): 120 s = str(round(f, digits)) 121 if '.' in s: 122 s = s[:s.index('.') + 1 + digits] 123 return s 124 125 126def _format_float_list(l): 127 return [_format_float(x) for x in l] 128 129 130def _double_to_long_bits(value): 131 if np.isnan(value): 132 value = float('nan') 133 # pack double into 64 bits, then unpack as long int 134 return struct.unpack('Q', struct.pack('d', value))[0] 135 136 137class VectorUDT(UserDefinedType): 138 """ 139 SQL user-defined type (UDT) for Vector. 140 """ 141 142 @classmethod 143 def sqlType(cls): 144 return StructType([ 145 StructField("type", ByteType(), False), 146 StructField("size", IntegerType(), True), 147 StructField("indices", ArrayType(IntegerType(), False), True), 148 StructField("values", ArrayType(DoubleType(), False), True)]) 149 150 @classmethod 151 def module(cls): 152 return "pyspark.ml.linalg" 153 154 @classmethod 155 def scalaUDT(cls): 156 return "org.apache.spark.ml.linalg.VectorUDT" 157 158 def serialize(self, obj): 159 if isinstance(obj, SparseVector): 160 indices = [int(i) for i in obj.indices] 161 values = [float(v) for v in obj.values] 162 return (0, obj.size, indices, values) 163 elif isinstance(obj, DenseVector): 164 values = [float(v) for v in obj] 165 return (1, None, None, values) 166 else: 167 raise TypeError("cannot serialize %r of type %r" % (obj, type(obj))) 168 169 def deserialize(self, datum): 170 assert len(datum) == 4, \ 171 "VectorUDT.deserialize given row with length %d but requires 4" % len(datum) 172 tpe = datum[0] 173 if tpe == 0: 174 return SparseVector(datum[1], datum[2], datum[3]) 175 elif tpe == 1: 176 return DenseVector(datum[3]) 177 else: 178 raise ValueError("do not recognize type %r" % tpe) 179 180 def simpleString(self): 181 return "vector" 182 183 184class MatrixUDT(UserDefinedType): 185 """ 186 SQL user-defined type (UDT) for Matrix. 187 """ 188 189 @classmethod 190 def sqlType(cls): 191 return StructType([ 192 StructField("type", ByteType(), False), 193 StructField("numRows", IntegerType(), False), 194 StructField("numCols", IntegerType(), False), 195 StructField("colPtrs", ArrayType(IntegerType(), False), True), 196 StructField("rowIndices", ArrayType(IntegerType(), False), True), 197 StructField("values", ArrayType(DoubleType(), False), True), 198 StructField("isTransposed", BooleanType(), False)]) 199 200 @classmethod 201 def module(cls): 202 return "pyspark.ml.linalg" 203 204 @classmethod 205 def scalaUDT(cls): 206 return "org.apache.spark.ml.linalg.MatrixUDT" 207 208 def serialize(self, obj): 209 if isinstance(obj, SparseMatrix): 210 colPtrs = [int(i) for i in obj.colPtrs] 211 rowIndices = [int(i) for i in obj.rowIndices] 212 values = [float(v) for v in obj.values] 213 return (0, obj.numRows, obj.numCols, colPtrs, 214 rowIndices, values, bool(obj.isTransposed)) 215 elif isinstance(obj, DenseMatrix): 216 values = [float(v) for v in obj.values] 217 return (1, obj.numRows, obj.numCols, None, None, values, 218 bool(obj.isTransposed)) 219 else: 220 raise TypeError("cannot serialize type %r" % (type(obj))) 221 222 def deserialize(self, datum): 223 assert len(datum) == 7, \ 224 "MatrixUDT.deserialize given row with length %d but requires 7" % len(datum) 225 tpe = datum[0] 226 if tpe == 0: 227 return SparseMatrix(*datum[1:]) 228 elif tpe == 1: 229 return DenseMatrix(datum[1], datum[2], datum[5], datum[6]) 230 else: 231 raise ValueError("do not recognize type %r" % tpe) 232 233 def simpleString(self): 234 return "matrix" 235 236 237class Vector(object): 238 239 __UDT__ = VectorUDT() 240 241 """ 242 Abstract class for DenseVector and SparseVector 243 """ 244 def toArray(self): 245 """ 246 Convert the vector into an numpy.ndarray 247 248 :return: numpy.ndarray 249 """ 250 raise NotImplementedError 251 252 253class DenseVector(Vector): 254 """ 255 A dense vector represented by a value array. We use numpy array for 256 storage and arithmetics will be delegated to the underlying numpy 257 array. 258 259 >>> v = Vectors.dense([1.0, 2.0]) 260 >>> u = Vectors.dense([3.0, 4.0]) 261 >>> v + u 262 DenseVector([4.0, 6.0]) 263 >>> 2 - v 264 DenseVector([1.0, 0.0]) 265 >>> v / 2 266 DenseVector([0.5, 1.0]) 267 >>> v * u 268 DenseVector([3.0, 8.0]) 269 >>> u / v 270 DenseVector([3.0, 2.0]) 271 >>> u % 2 272 DenseVector([1.0, 0.0]) 273 """ 274 def __init__(self, ar): 275 if isinstance(ar, bytes): 276 ar = np.frombuffer(ar, dtype=np.float64) 277 elif not isinstance(ar, np.ndarray): 278 ar = np.array(ar, dtype=np.float64) 279 if ar.dtype != np.float64: 280 ar = ar.astype(np.float64) 281 self.array = ar 282 283 def __reduce__(self): 284 return DenseVector, (self.array.tostring(),) 285 286 def numNonzeros(self): 287 """ 288 Number of nonzero elements. This scans all active values and count non zeros 289 """ 290 return np.count_nonzero(self.array) 291 292 def norm(self, p): 293 """ 294 Calculates the norm of a DenseVector. 295 296 >>> a = DenseVector([0, -1, 2, -3]) 297 >>> a.norm(2) 298 3.7... 299 >>> a.norm(1) 300 6.0 301 """ 302 return np.linalg.norm(self.array, p) 303 304 def dot(self, other): 305 """ 306 Compute the dot product of two Vectors. We support 307 (Numpy array, list, SparseVector, or SciPy sparse) 308 and a target NumPy array that is either 1- or 2-dimensional. 309 Equivalent to calling numpy.dot of the two vectors. 310 311 >>> dense = DenseVector(array.array('d', [1., 2.])) 312 >>> dense.dot(dense) 313 5.0 314 >>> dense.dot(SparseVector(2, [0, 1], [2., 1.])) 315 4.0 316 >>> dense.dot(range(1, 3)) 317 5.0 318 >>> dense.dot(np.array(range(1, 3))) 319 5.0 320 >>> dense.dot([1.,]) 321 Traceback (most recent call last): 322 ... 323 AssertionError: dimension mismatch 324 >>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F')) 325 array([ 5., 11.]) 326 >>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F')) 327 Traceback (most recent call last): 328 ... 329 AssertionError: dimension mismatch 330 """ 331 if type(other) == np.ndarray: 332 if other.ndim > 1: 333 assert len(self) == other.shape[0], "dimension mismatch" 334 return np.dot(self.array, other) 335 elif _have_scipy and scipy.sparse.issparse(other): 336 assert len(self) == other.shape[0], "dimension mismatch" 337 return other.transpose().dot(self.toArray()) 338 else: 339 assert len(self) == _vector_size(other), "dimension mismatch" 340 if isinstance(other, SparseVector): 341 return other.dot(self) 342 elif isinstance(other, Vector): 343 return np.dot(self.toArray(), other.toArray()) 344 else: 345 return np.dot(self.toArray(), other) 346 347 def squared_distance(self, other): 348 """ 349 Squared distance of two Vectors. 350 351 >>> dense1 = DenseVector(array.array('d', [1., 2.])) 352 >>> dense1.squared_distance(dense1) 353 0.0 354 >>> dense2 = np.array([2., 1.]) 355 >>> dense1.squared_distance(dense2) 356 2.0 357 >>> dense3 = [2., 1.] 358 >>> dense1.squared_distance(dense3) 359 2.0 360 >>> sparse1 = SparseVector(2, [0, 1], [2., 1.]) 361 >>> dense1.squared_distance(sparse1) 362 2.0 363 >>> dense1.squared_distance([1.,]) 364 Traceback (most recent call last): 365 ... 366 AssertionError: dimension mismatch 367 >>> dense1.squared_distance(SparseVector(1, [0,], [1.,])) 368 Traceback (most recent call last): 369 ... 370 AssertionError: dimension mismatch 371 """ 372 assert len(self) == _vector_size(other), "dimension mismatch" 373 if isinstance(other, SparseVector): 374 return other.squared_distance(self) 375 elif _have_scipy and scipy.sparse.issparse(other): 376 return _convert_to_vector(other).squared_distance(self) 377 378 if isinstance(other, Vector): 379 other = other.toArray() 380 elif not isinstance(other, np.ndarray): 381 other = np.array(other) 382 diff = self.toArray() - other 383 return np.dot(diff, diff) 384 385 def toArray(self): 386 """ 387 Returns an numpy.ndarray 388 """ 389 return self.array 390 391 @property 392 def values(self): 393 """ 394 Returns a list of values 395 """ 396 return self.array 397 398 def __getitem__(self, item): 399 return self.array[item] 400 401 def __len__(self): 402 return len(self.array) 403 404 def __str__(self): 405 return "[" + ",".join([str(v) for v in self.array]) + "]" 406 407 def __repr__(self): 408 return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array)) 409 410 def __eq__(self, other): 411 if isinstance(other, DenseVector): 412 return np.array_equal(self.array, other.array) 413 elif isinstance(other, SparseVector): 414 if len(self) != other.size: 415 return False 416 return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values) 417 return False 418 419 def __ne__(self, other): 420 return not self == other 421 422 def __hash__(self): 423 size = len(self) 424 result = 31 + size 425 nnz = 0 426 i = 0 427 while i < size and nnz < 128: 428 if self.array[i] != 0: 429 result = 31 * result + i 430 bits = _double_to_long_bits(self.array[i]) 431 result = 31 * result + (bits ^ (bits >> 32)) 432 nnz += 1 433 i += 1 434 return result 435 436 def __getattr__(self, item): 437 return getattr(self.array, item) 438 439 def _delegate(op): 440 def func(self, other): 441 if isinstance(other, DenseVector): 442 other = other.array 443 return DenseVector(getattr(self.array, op)(other)) 444 return func 445 446 __neg__ = _delegate("__neg__") 447 __add__ = _delegate("__add__") 448 __sub__ = _delegate("__sub__") 449 __mul__ = _delegate("__mul__") 450 __div__ = _delegate("__div__") 451 __truediv__ = _delegate("__truediv__") 452 __mod__ = _delegate("__mod__") 453 __radd__ = _delegate("__radd__") 454 __rsub__ = _delegate("__rsub__") 455 __rmul__ = _delegate("__rmul__") 456 __rdiv__ = _delegate("__rdiv__") 457 __rtruediv__ = _delegate("__rtruediv__") 458 __rmod__ = _delegate("__rmod__") 459 460 461class SparseVector(Vector): 462 """ 463 A simple sparse vector class for passing data to MLlib. Users may 464 alternatively pass SciPy's {scipy.sparse} data types. 465 """ 466 def __init__(self, size, *args): 467 """ 468 Create a sparse vector, using either a dictionary, a list of 469 (index, value) pairs, or two separate arrays of indices and 470 values (sorted by index). 471 472 :param size: Size of the vector. 473 :param args: Active entries, as a dictionary {index: value, ...}, 474 a list of tuples [(index, value), ...], or a list of strictly 475 increasing indices and a list of corresponding values [index, ...], 476 [value, ...]. Inactive entries are treated as zeros. 477 478 >>> SparseVector(4, {1: 1.0, 3: 5.5}) 479 SparseVector(4, {1: 1.0, 3: 5.5}) 480 >>> SparseVector(4, [(1, 1.0), (3, 5.5)]) 481 SparseVector(4, {1: 1.0, 3: 5.5}) 482 >>> SparseVector(4, [1, 3], [1.0, 5.5]) 483 SparseVector(4, {1: 1.0, 3: 5.5}) 484 >>> SparseVector(4, {1:1.0, 6:2.0}) 485 Traceback (most recent call last): 486 ... 487 AssertionError: Index 6 is out of the the size of vector with size=4 488 >>> SparseVector(4, {-1:1.0}) 489 Traceback (most recent call last): 490 ... 491 AssertionError: Contains negative index -1 492 """ 493 self.size = int(size) 494 """ Size of the vector. """ 495 assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments" 496 if len(args) == 1: 497 pairs = args[0] 498 if type(pairs) == dict: 499 pairs = pairs.items() 500 pairs = sorted(pairs) 501 self.indices = np.array([p[0] for p in pairs], dtype=np.int32) 502 """ A list of indices corresponding to active entries. """ 503 self.values = np.array([p[1] for p in pairs], dtype=np.float64) 504 """ A list of values corresponding to active entries. """ 505 else: 506 if isinstance(args[0], bytes): 507 assert isinstance(args[1], bytes), "values should be string too" 508 if args[0]: 509 self.indices = np.frombuffer(args[0], np.int32) 510 self.values = np.frombuffer(args[1], np.float64) 511 else: 512 # np.frombuffer() doesn't work well with empty string in older version 513 self.indices = np.array([], dtype=np.int32) 514 self.values = np.array([], dtype=np.float64) 515 else: 516 self.indices = np.array(args[0], dtype=np.int32) 517 self.values = np.array(args[1], dtype=np.float64) 518 assert len(self.indices) == len(self.values), "index and value arrays not same length" 519 for i in xrange(len(self.indices) - 1): 520 if self.indices[i] >= self.indices[i + 1]: 521 raise TypeError( 522 "Indices %s and %s are not strictly increasing" 523 % (self.indices[i], self.indices[i + 1])) 524 525 if self.indices.size > 0: 526 assert np.max(self.indices) < self.size, \ 527 "Index %d is out of the the size of vector with size=%d" \ 528 % (np.max(self.indices), self.size) 529 assert np.min(self.indices) >= 0, \ 530 "Contains negative index %d" % (np.min(self.indices)) 531 532 def numNonzeros(self): 533 """ 534 Number of nonzero elements. This scans all active values and count non zeros. 535 """ 536 return np.count_nonzero(self.values) 537 538 def norm(self, p): 539 """ 540 Calculates the norm of a SparseVector. 541 542 >>> a = SparseVector(4, [0, 1], [3., -4.]) 543 >>> a.norm(1) 544 7.0 545 >>> a.norm(2) 546 5.0 547 """ 548 return np.linalg.norm(self.values, p) 549 550 def __reduce__(self): 551 return ( 552 SparseVector, 553 (self.size, self.indices.tostring(), self.values.tostring())) 554 555 def dot(self, other): 556 """ 557 Dot product with a SparseVector or 1- or 2-dimensional Numpy array. 558 559 >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) 560 >>> a.dot(a) 561 25.0 562 >>> a.dot(array.array('d', [1., 2., 3., 4.])) 563 22.0 564 >>> b = SparseVector(4, [2], [1.0]) 565 >>> a.dot(b) 566 0.0 567 >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]])) 568 array([ 22., 22.]) 569 >>> a.dot([1., 2., 3.]) 570 Traceback (most recent call last): 571 ... 572 AssertionError: dimension mismatch 573 >>> a.dot(np.array([1., 2.])) 574 Traceback (most recent call last): 575 ... 576 AssertionError: dimension mismatch 577 >>> a.dot(DenseVector([1., 2.])) 578 Traceback (most recent call last): 579 ... 580 AssertionError: dimension mismatch 581 >>> a.dot(np.zeros((3, 2))) 582 Traceback (most recent call last): 583 ... 584 AssertionError: dimension mismatch 585 """ 586 587 if isinstance(other, np.ndarray): 588 if other.ndim not in [2, 1]: 589 raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim) 590 assert len(self) == other.shape[0], "dimension mismatch" 591 return np.dot(self.values, other[self.indices]) 592 593 assert len(self) == _vector_size(other), "dimension mismatch" 594 595 if isinstance(other, DenseVector): 596 return np.dot(other.array[self.indices], self.values) 597 598 elif isinstance(other, SparseVector): 599 # Find out common indices. 600 self_cmind = np.in1d(self.indices, other.indices, assume_unique=True) 601 self_values = self.values[self_cmind] 602 if self_values.size == 0: 603 return 0.0 604 else: 605 other_cmind = np.in1d(other.indices, self.indices, assume_unique=True) 606 return np.dot(self_values, other.values[other_cmind]) 607 608 else: 609 return self.dot(_convert_to_vector(other)) 610 611 def squared_distance(self, other): 612 """ 613 Squared distance from a SparseVector or 1-dimensional NumPy array. 614 615 >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) 616 >>> a.squared_distance(a) 617 0.0 618 >>> a.squared_distance(array.array('d', [1., 2., 3., 4.])) 619 11.0 620 >>> a.squared_distance(np.array([1., 2., 3., 4.])) 621 11.0 622 >>> b = SparseVector(4, [2], [1.0]) 623 >>> a.squared_distance(b) 624 26.0 625 >>> b.squared_distance(a) 626 26.0 627 >>> b.squared_distance([1., 2.]) 628 Traceback (most recent call last): 629 ... 630 AssertionError: dimension mismatch 631 >>> b.squared_distance(SparseVector(3, [1,], [1.0,])) 632 Traceback (most recent call last): 633 ... 634 AssertionError: dimension mismatch 635 """ 636 assert len(self) == _vector_size(other), "dimension mismatch" 637 638 if isinstance(other, np.ndarray) or isinstance(other, DenseVector): 639 if isinstance(other, np.ndarray) and other.ndim != 1: 640 raise Exception("Cannot call squared_distance with %d-dimensional array" % 641 other.ndim) 642 if isinstance(other, DenseVector): 643 other = other.array 644 sparse_ind = np.zeros(other.size, dtype=bool) 645 sparse_ind[self.indices] = True 646 dist = other[sparse_ind] - self.values 647 result = np.dot(dist, dist) 648 649 other_ind = other[~sparse_ind] 650 result += np.dot(other_ind, other_ind) 651 return result 652 653 elif isinstance(other, SparseVector): 654 result = 0.0 655 i, j = 0, 0 656 while i < len(self.indices) and j < len(other.indices): 657 if self.indices[i] == other.indices[j]: 658 diff = self.values[i] - other.values[j] 659 result += diff * diff 660 i += 1 661 j += 1 662 elif self.indices[i] < other.indices[j]: 663 result += self.values[i] * self.values[i] 664 i += 1 665 else: 666 result += other.values[j] * other.values[j] 667 j += 1 668 while i < len(self.indices): 669 result += self.values[i] * self.values[i] 670 i += 1 671 while j < len(other.indices): 672 result += other.values[j] * other.values[j] 673 j += 1 674 return result 675 else: 676 return self.squared_distance(_convert_to_vector(other)) 677 678 def toArray(self): 679 """ 680 Returns a copy of this SparseVector as a 1-dimensional NumPy array. 681 """ 682 arr = np.zeros((self.size,), dtype=np.float64) 683 arr[self.indices] = self.values 684 return arr 685 686 def __len__(self): 687 return self.size 688 689 def __str__(self): 690 inds = "[" + ",".join([str(i) for i in self.indices]) + "]" 691 vals = "[" + ",".join([str(v) for v in self.values]) + "]" 692 return "(" + ",".join((str(self.size), inds, vals)) + ")" 693 694 def __repr__(self): 695 inds = self.indices 696 vals = self.values 697 entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i])) 698 for i in xrange(len(inds))]) 699 return "SparseVector({0}, {{{1}}})".format(self.size, entries) 700 701 def __eq__(self, other): 702 if isinstance(other, SparseVector): 703 return other.size == self.size and np.array_equal(other.indices, self.indices) \ 704 and np.array_equal(other.values, self.values) 705 elif isinstance(other, DenseVector): 706 if self.size != len(other): 707 return False 708 return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array) 709 return False 710 711 def __getitem__(self, index): 712 inds = self.indices 713 vals = self.values 714 if not isinstance(index, int): 715 raise TypeError( 716 "Indices must be of type integer, got type %s" % type(index)) 717 718 if index >= self.size or index < -self.size: 719 raise IndexError("Index %d out of bounds." % index) 720 if index < 0: 721 index += self.size 722 723 if (inds.size == 0) or (index > inds.item(-1)): 724 return 0. 725 726 insert_index = np.searchsorted(inds, index) 727 row_ind = inds[insert_index] 728 if row_ind == index: 729 return vals[insert_index] 730 return 0. 731 732 def __ne__(self, other): 733 return not self.__eq__(other) 734 735 def __hash__(self): 736 result = 31 + self.size 737 nnz = 0 738 i = 0 739 while i < len(self.values) and nnz < 128: 740 if self.values[i] != 0: 741 result = 31 * result + int(self.indices[i]) 742 bits = _double_to_long_bits(self.values[i]) 743 result = 31 * result + (bits ^ (bits >> 32)) 744 nnz += 1 745 i += 1 746 return result 747 748 749class Vectors(object): 750 751 """ 752 Factory methods for working with vectors. 753 754 .. note:: Dense vectors are simply represented as NumPy array objects, 755 so there is no need to covert them for use in MLlib. For sparse vectors, 756 the factory methods in this class create an MLlib-compatible type, or users 757 can pass in SciPy's C{scipy.sparse} column vectors. 758 """ 759 760 @staticmethod 761 def sparse(size, *args): 762 """ 763 Create a sparse vector, using either a dictionary, a list of 764 (index, value) pairs, or two separate arrays of indices and 765 values (sorted by index). 766 767 :param size: Size of the vector. 768 :param args: Non-zero entries, as a dictionary, list of tuples, 769 or two sorted lists containing indices and values. 770 771 >>> Vectors.sparse(4, {1: 1.0, 3: 5.5}) 772 SparseVector(4, {1: 1.0, 3: 5.5}) 773 >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)]) 774 SparseVector(4, {1: 1.0, 3: 5.5}) 775 >>> Vectors.sparse(4, [1, 3], [1.0, 5.5]) 776 SparseVector(4, {1: 1.0, 3: 5.5}) 777 """ 778 return SparseVector(size, *args) 779 780 @staticmethod 781 def dense(*elements): 782 """ 783 Create a dense vector of 64-bit floats from a Python list or numbers. 784 785 >>> Vectors.dense([1, 2, 3]) 786 DenseVector([1.0, 2.0, 3.0]) 787 >>> Vectors.dense(1.0, 2.0) 788 DenseVector([1.0, 2.0]) 789 """ 790 if len(elements) == 1 and not isinstance(elements[0], (float, int, long)): 791 # it's list, numpy.array or other iterable object. 792 elements = elements[0] 793 return DenseVector(elements) 794 795 @staticmethod 796 def squared_distance(v1, v2): 797 """ 798 Squared distance between two vectors. 799 a and b can be of type SparseVector, DenseVector, np.ndarray 800 or array.array. 801 802 >>> a = Vectors.sparse(4, [(0, 1), (3, 4)]) 803 >>> b = Vectors.dense([2, 5, 4, 1]) 804 >>> a.squared_distance(b) 805 51.0 806 """ 807 v1, v2 = _convert_to_vector(v1), _convert_to_vector(v2) 808 return v1.squared_distance(v2) 809 810 @staticmethod 811 def norm(vector, p): 812 """ 813 Find norm of the given vector. 814 """ 815 return _convert_to_vector(vector).norm(p) 816 817 @staticmethod 818 def zeros(size): 819 return DenseVector(np.zeros(size)) 820 821 @staticmethod 822 def _equals(v1_indices, v1_values, v2_indices, v2_values): 823 """ 824 Check equality between sparse/dense vectors, 825 v1_indices and v2_indices assume to be strictly increasing. 826 """ 827 v1_size = len(v1_values) 828 v2_size = len(v2_values) 829 k1 = 0 830 k2 = 0 831 all_equal = True 832 while all_equal: 833 while k1 < v1_size and v1_values[k1] == 0: 834 k1 += 1 835 while k2 < v2_size and v2_values[k2] == 0: 836 k2 += 1 837 838 if k1 >= v1_size or k2 >= v2_size: 839 return k1 >= v1_size and k2 >= v2_size 840 841 all_equal = v1_indices[k1] == v2_indices[k2] and v1_values[k1] == v2_values[k2] 842 k1 += 1 843 k2 += 1 844 return all_equal 845 846 847class Matrix(object): 848 849 __UDT__ = MatrixUDT() 850 851 """ 852 Represents a local matrix. 853 """ 854 def __init__(self, numRows, numCols, isTransposed=False): 855 self.numRows = numRows 856 self.numCols = numCols 857 self.isTransposed = isTransposed 858 859 def toArray(self): 860 """ 861 Returns its elements in a NumPy ndarray. 862 """ 863 raise NotImplementedError 864 865 @staticmethod 866 def _convert_to_array(array_like, dtype): 867 """ 868 Convert Matrix attributes which are array-like or buffer to array. 869 """ 870 if isinstance(array_like, bytes): 871 return np.frombuffer(array_like, dtype=dtype) 872 return np.asarray(array_like, dtype=dtype) 873 874 875class DenseMatrix(Matrix): 876 """ 877 Column-major dense matrix. 878 """ 879 def __init__(self, numRows, numCols, values, isTransposed=False): 880 Matrix.__init__(self, numRows, numCols, isTransposed) 881 values = self._convert_to_array(values, np.float64) 882 assert len(values) == numRows * numCols 883 self.values = values 884 885 def __reduce__(self): 886 return DenseMatrix, ( 887 self.numRows, self.numCols, self.values.tostring(), 888 int(self.isTransposed)) 889 890 def __str__(self): 891 """ 892 Pretty printing of a DenseMatrix 893 894 >>> dm = DenseMatrix(2, 2, range(4)) 895 >>> print(dm) 896 DenseMatrix([[ 0., 2.], 897 [ 1., 3.]]) 898 >>> dm = DenseMatrix(2, 2, range(4), isTransposed=True) 899 >>> print(dm) 900 DenseMatrix([[ 0., 1.], 901 [ 2., 3.]]) 902 """ 903 # Inspired by __repr__ in scipy matrices. 904 array_lines = repr(self.toArray()).splitlines() 905 906 # We need to adjust six spaces which is the difference in number 907 # of letters between "DenseMatrix" and "array" 908 x = '\n'.join([(" " * 6 + line) for line in array_lines[1:]]) 909 return array_lines[0].replace("array", "DenseMatrix") + "\n" + x 910 911 def __repr__(self): 912 """ 913 Representation of a DenseMatrix 914 915 >>> dm = DenseMatrix(2, 2, range(4)) 916 >>> dm 917 DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False) 918 """ 919 # If the number of values are less than seventeen then return as it is. 920 # Else return first eight values and last eight values. 921 if len(self.values) < 17: 922 entries = _format_float_list(self.values) 923 else: 924 entries = ( 925 _format_float_list(self.values[:8]) + 926 ["..."] + 927 _format_float_list(self.values[-8:]) 928 ) 929 930 entries = ", ".join(entries) 931 return "DenseMatrix({0}, {1}, [{2}], {3})".format( 932 self.numRows, self.numCols, entries, self.isTransposed) 933 934 def toArray(self): 935 """ 936 Return an numpy.ndarray 937 938 >>> m = DenseMatrix(2, 2, range(4)) 939 >>> m.toArray() 940 array([[ 0., 2.], 941 [ 1., 3.]]) 942 """ 943 if self.isTransposed: 944 return np.asfortranarray( 945 self.values.reshape((self.numRows, self.numCols))) 946 else: 947 return self.values.reshape((self.numRows, self.numCols), order='F') 948 949 def toSparse(self): 950 """Convert to SparseMatrix""" 951 if self.isTransposed: 952 values = np.ravel(self.toArray(), order='F') 953 else: 954 values = self.values 955 indices = np.nonzero(values)[0] 956 colCounts = np.bincount(indices // self.numRows) 957 colPtrs = np.cumsum(np.hstack( 958 (0, colCounts, np.zeros(self.numCols - colCounts.size)))) 959 values = values[indices] 960 rowIndices = indices % self.numRows 961 962 return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values) 963 964 def __getitem__(self, indices): 965 i, j = indices 966 if i < 0 or i >= self.numRows: 967 raise IndexError("Row index %d is out of range [0, %d)" 968 % (i, self.numRows)) 969 if j >= self.numCols or j < 0: 970 raise IndexError("Column index %d is out of range [0, %d)" 971 % (j, self.numCols)) 972 973 if self.isTransposed: 974 return self.values[i * self.numCols + j] 975 else: 976 return self.values[i + j * self.numRows] 977 978 def __eq__(self, other): 979 if (not isinstance(other, DenseMatrix) or 980 self.numRows != other.numRows or 981 self.numCols != other.numCols): 982 return False 983 984 self_values = np.ravel(self.toArray(), order='F') 985 other_values = np.ravel(other.toArray(), order='F') 986 return all(self_values == other_values) 987 988 989class SparseMatrix(Matrix): 990 """Sparse Matrix stored in CSC format.""" 991 def __init__(self, numRows, numCols, colPtrs, rowIndices, values, 992 isTransposed=False): 993 Matrix.__init__(self, numRows, numCols, isTransposed) 994 self.colPtrs = self._convert_to_array(colPtrs, np.int32) 995 self.rowIndices = self._convert_to_array(rowIndices, np.int32) 996 self.values = self._convert_to_array(values, np.float64) 997 998 if self.isTransposed: 999 if self.colPtrs.size != numRows + 1: 1000 raise ValueError("Expected colPtrs of size %d, got %d." 1001 % (numRows + 1, self.colPtrs.size)) 1002 else: 1003 if self.colPtrs.size != numCols + 1: 1004 raise ValueError("Expected colPtrs of size %d, got %d." 1005 % (numCols + 1, self.colPtrs.size)) 1006 if self.rowIndices.size != self.values.size: 1007 raise ValueError("Expected rowIndices of length %d, got %d." 1008 % (self.rowIndices.size, self.values.size)) 1009 1010 def __str__(self): 1011 """ 1012 Pretty printing of a SparseMatrix 1013 1014 >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) 1015 >>> print(sm1) 1016 2 X 2 CSCMatrix 1017 (0,0) 2.0 1018 (1,0) 3.0 1019 (1,1) 4.0 1020 >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) 1021 >>> print(sm1) 1022 2 X 2 CSRMatrix 1023 (0,0) 2.0 1024 (0,1) 3.0 1025 (1,1) 4.0 1026 """ 1027 spstr = "{0} X {1} ".format(self.numRows, self.numCols) 1028 if self.isTransposed: 1029 spstr += "CSRMatrix\n" 1030 else: 1031 spstr += "CSCMatrix\n" 1032 1033 cur_col = 0 1034 smlist = [] 1035 1036 # Display first 16 values. 1037 if len(self.values) <= 16: 1038 zipindval = zip(self.rowIndices, self.values) 1039 else: 1040 zipindval = zip(self.rowIndices[:16], self.values[:16]) 1041 for i, (rowInd, value) in enumerate(zipindval): 1042 if self.colPtrs[cur_col + 1] <= i: 1043 cur_col += 1 1044 if self.isTransposed: 1045 smlist.append('({0},{1}) {2}'.format( 1046 cur_col, rowInd, _format_float(value))) 1047 else: 1048 smlist.append('({0},{1}) {2}'.format( 1049 rowInd, cur_col, _format_float(value))) 1050 spstr += "\n".join(smlist) 1051 1052 if len(self.values) > 16: 1053 spstr += "\n.." * 2 1054 return spstr 1055 1056 def __repr__(self): 1057 """ 1058 Representation of a SparseMatrix 1059 1060 >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) 1061 >>> sm1 1062 SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0], False) 1063 """ 1064 rowIndices = list(self.rowIndices) 1065 colPtrs = list(self.colPtrs) 1066 1067 if len(self.values) <= 16: 1068 values = _format_float_list(self.values) 1069 1070 else: 1071 values = ( 1072 _format_float_list(self.values[:8]) + 1073 ["..."] + 1074 _format_float_list(self.values[-8:]) 1075 ) 1076 rowIndices = rowIndices[:8] + ["..."] + rowIndices[-8:] 1077 1078 if len(self.colPtrs) > 16: 1079 colPtrs = colPtrs[:8] + ["..."] + colPtrs[-8:] 1080 1081 values = ", ".join(values) 1082 rowIndices = ", ".join([str(ind) for ind in rowIndices]) 1083 colPtrs = ", ".join([str(ptr) for ptr in colPtrs]) 1084 return "SparseMatrix({0}, {1}, [{2}], [{3}], [{4}], {5})".format( 1085 self.numRows, self.numCols, colPtrs, rowIndices, 1086 values, self.isTransposed) 1087 1088 def __reduce__(self): 1089 return SparseMatrix, ( 1090 self.numRows, self.numCols, self.colPtrs.tostring(), 1091 self.rowIndices.tostring(), self.values.tostring(), 1092 int(self.isTransposed)) 1093 1094 def __getitem__(self, indices): 1095 i, j = indices 1096 if i < 0 or i >= self.numRows: 1097 raise IndexError("Row index %d is out of range [0, %d)" 1098 % (i, self.numRows)) 1099 if j < 0 or j >= self.numCols: 1100 raise IndexError("Column index %d is out of range [0, %d)" 1101 % (j, self.numCols)) 1102 1103 # If a CSR matrix is given, then the row index should be searched 1104 # for in ColPtrs, and the column index should be searched for in the 1105 # corresponding slice obtained from rowIndices. 1106 if self.isTransposed: 1107 j, i = i, j 1108 1109 colStart = self.colPtrs[j] 1110 colEnd = self.colPtrs[j + 1] 1111 nz = self.rowIndices[colStart: colEnd] 1112 ind = np.searchsorted(nz, i) + colStart 1113 if ind < colEnd and self.rowIndices[ind] == i: 1114 return self.values[ind] 1115 else: 1116 return 0.0 1117 1118 def toArray(self): 1119 """ 1120 Return an numpy.ndarray 1121 """ 1122 A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F') 1123 for k in xrange(self.colPtrs.size - 1): 1124 startptr = self.colPtrs[k] 1125 endptr = self.colPtrs[k + 1] 1126 if self.isTransposed: 1127 A[k, self.rowIndices[startptr:endptr]] = self.values[startptr:endptr] 1128 else: 1129 A[self.rowIndices[startptr:endptr], k] = self.values[startptr:endptr] 1130 return A 1131 1132 def toDense(self): 1133 densevals = np.ravel(self.toArray(), order='F') 1134 return DenseMatrix(self.numRows, self.numCols, densevals) 1135 1136 # TODO: More efficient implementation: 1137 def __eq__(self, other): 1138 return np.all(self.toArray() == other.toArray()) 1139 1140 1141class Matrices(object): 1142 @staticmethod 1143 def dense(numRows, numCols, values): 1144 """ 1145 Create a DenseMatrix 1146 """ 1147 return DenseMatrix(numRows, numCols, values) 1148 1149 @staticmethod 1150 def sparse(numRows, numCols, colPtrs, rowIndices, values): 1151 """ 1152 Create a SparseMatrix 1153 """ 1154 return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values) 1155 1156 1157def _test(): 1158 import doctest 1159 (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS) 1160 if failure_count: 1161 exit(-1) 1162 1163if __name__ == "__main__": 1164 _test() 1165