1# 2# Licensed to the Apache Software Foundation (ASF) under one or more 3# contributor license agreements. See the NOTICE file distributed with 4# this work for additional information regarding copyright ownership. 5# The ASF licenses this file to You under the Apache License, Version 2.0 6# (the "License"); you may not use this file except in compliance with 7# the License. You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, software 12# distributed under the License is distributed on an "AS IS" BASIS, 13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14# See the License for the specific language governing permissions and 15# limitations under the License. 16# 17 18""" 19MLlib utilities for linear algebra. For dense vectors, MLlib 20uses the NumPy C{array} type, so you can simply pass NumPy arrays 21around. For sparse vectors, users can construct a L{SparseVector} 22object from MLlib or pass SciPy C{scipy.sparse} column vectors if 23SciPy is available in their environment. 24""" 25 26import sys 27import array 28import struct 29 30if sys.version >= '3': 31 basestring = str 32 xrange = range 33 import copyreg as copy_reg 34 long = int 35else: 36 from itertools import izip as zip 37 import copy_reg 38 39import numpy as np 40 41from pyspark import since 42from pyspark.ml import linalg as newlinalg 43from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \ 44 IntegerType, ByteType, BooleanType 45 46 47__all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors', 48 'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices', 49 'QRDecomposition'] 50 51 52if sys.version_info[:2] == (2, 7): 53 # speed up pickling array in Python 2.7 54 def fast_pickle_array(ar): 55 return array.array, (ar.typecode, ar.tostring()) 56 copy_reg.pickle(array.array, fast_pickle_array) 57 58 59# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods, 60# such as _dot and _serialize_double_vector, start to support scipy.sparse matrices. 61 62try: 63 import scipy.sparse 64 _have_scipy = True 65except: 66 # No SciPy in environment, but that's okay 67 _have_scipy = False 68 69 70def _convert_to_vector(l): 71 if isinstance(l, Vector): 72 return l 73 elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange): 74 return DenseVector(l) 75 elif _have_scipy and scipy.sparse.issparse(l): 76 assert l.shape[1] == 1, "Expected column vector" 77 # Make sure the converted csc_matrix has sorted indices. 78 csc = l.tocsc() 79 if not csc.has_sorted_indices: 80 csc.sort_indices() 81 return SparseVector(l.shape[0], csc.indices, csc.data) 82 else: 83 raise TypeError("Cannot convert type %s into Vector" % type(l)) 84 85 86def _vector_size(v): 87 """ 88 Returns the size of the vector. 89 90 >>> _vector_size([1., 2., 3.]) 91 3 92 >>> _vector_size((1., 2., 3.)) 93 3 94 >>> _vector_size(array.array('d', [1., 2., 3.])) 95 3 96 >>> _vector_size(np.zeros(3)) 97 3 98 >>> _vector_size(np.zeros((3, 1))) 99 3 100 >>> _vector_size(np.zeros((1, 3))) 101 Traceback (most recent call last): 102 ... 103 ValueError: Cannot treat an ndarray of shape (1, 3) as a vector 104 """ 105 if isinstance(v, Vector): 106 return len(v) 107 elif type(v) in (array.array, list, tuple, xrange): 108 return len(v) 109 elif type(v) == np.ndarray: 110 if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1): 111 return len(v) 112 else: 113 raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape)) 114 elif _have_scipy and scipy.sparse.issparse(v): 115 assert v.shape[1] == 1, "Expected column vector" 116 return v.shape[0] 117 else: 118 raise TypeError("Cannot treat type %s as a vector" % type(v)) 119 120 121def _format_float(f, digits=4): 122 s = str(round(f, digits)) 123 if '.' in s: 124 s = s[:s.index('.') + 1 + digits] 125 return s 126 127 128def _format_float_list(l): 129 return [_format_float(x) for x in l] 130 131 132def _double_to_long_bits(value): 133 if np.isnan(value): 134 value = float('nan') 135 # pack double into 64 bits, then unpack as long int 136 return struct.unpack('Q', struct.pack('d', value))[0] 137 138 139class VectorUDT(UserDefinedType): 140 """ 141 SQL user-defined type (UDT) for Vector. 142 """ 143 144 @classmethod 145 def sqlType(cls): 146 return StructType([ 147 StructField("type", ByteType(), False), 148 StructField("size", IntegerType(), True), 149 StructField("indices", ArrayType(IntegerType(), False), True), 150 StructField("values", ArrayType(DoubleType(), False), True)]) 151 152 @classmethod 153 def module(cls): 154 return "pyspark.mllib.linalg" 155 156 @classmethod 157 def scalaUDT(cls): 158 return "org.apache.spark.mllib.linalg.VectorUDT" 159 160 def serialize(self, obj): 161 if isinstance(obj, SparseVector): 162 indices = [int(i) for i in obj.indices] 163 values = [float(v) for v in obj.values] 164 return (0, obj.size, indices, values) 165 elif isinstance(obj, DenseVector): 166 values = [float(v) for v in obj] 167 return (1, None, None, values) 168 else: 169 raise TypeError("cannot serialize %r of type %r" % (obj, type(obj))) 170 171 def deserialize(self, datum): 172 assert len(datum) == 4, \ 173 "VectorUDT.deserialize given row with length %d but requires 4" % len(datum) 174 tpe = datum[0] 175 if tpe == 0: 176 return SparseVector(datum[1], datum[2], datum[3]) 177 elif tpe == 1: 178 return DenseVector(datum[3]) 179 else: 180 raise ValueError("do not recognize type %r" % tpe) 181 182 def simpleString(self): 183 return "vector" 184 185 186class MatrixUDT(UserDefinedType): 187 """ 188 SQL user-defined type (UDT) for Matrix. 189 """ 190 191 @classmethod 192 def sqlType(cls): 193 return StructType([ 194 StructField("type", ByteType(), False), 195 StructField("numRows", IntegerType(), False), 196 StructField("numCols", IntegerType(), False), 197 StructField("colPtrs", ArrayType(IntegerType(), False), True), 198 StructField("rowIndices", ArrayType(IntegerType(), False), True), 199 StructField("values", ArrayType(DoubleType(), False), True), 200 StructField("isTransposed", BooleanType(), False)]) 201 202 @classmethod 203 def module(cls): 204 return "pyspark.mllib.linalg" 205 206 @classmethod 207 def scalaUDT(cls): 208 return "org.apache.spark.mllib.linalg.MatrixUDT" 209 210 def serialize(self, obj): 211 if isinstance(obj, SparseMatrix): 212 colPtrs = [int(i) for i in obj.colPtrs] 213 rowIndices = [int(i) for i in obj.rowIndices] 214 values = [float(v) for v in obj.values] 215 return (0, obj.numRows, obj.numCols, colPtrs, 216 rowIndices, values, bool(obj.isTransposed)) 217 elif isinstance(obj, DenseMatrix): 218 values = [float(v) for v in obj.values] 219 return (1, obj.numRows, obj.numCols, None, None, values, 220 bool(obj.isTransposed)) 221 else: 222 raise TypeError("cannot serialize type %r" % (type(obj))) 223 224 def deserialize(self, datum): 225 assert len(datum) == 7, \ 226 "MatrixUDT.deserialize given row with length %d but requires 7" % len(datum) 227 tpe = datum[0] 228 if tpe == 0: 229 return SparseMatrix(*datum[1:]) 230 elif tpe == 1: 231 return DenseMatrix(datum[1], datum[2], datum[5], datum[6]) 232 else: 233 raise ValueError("do not recognize type %r" % tpe) 234 235 def simpleString(self): 236 return "matrix" 237 238 239class Vector(object): 240 241 __UDT__ = VectorUDT() 242 243 """ 244 Abstract class for DenseVector and SparseVector 245 """ 246 def toArray(self): 247 """ 248 Convert the vector into an numpy.ndarray 249 250 :return: numpy.ndarray 251 """ 252 raise NotImplementedError 253 254 def asML(self): 255 """ 256 Convert this vector to the new mllib-local representation. 257 This does NOT copy the data; it copies references. 258 259 :return: :py:class:`pyspark.ml.linalg.Vector` 260 """ 261 raise NotImplementedError 262 263 264class DenseVector(Vector): 265 """ 266 A dense vector represented by a value array. We use numpy array for 267 storage and arithmetics will be delegated to the underlying numpy 268 array. 269 270 >>> v = Vectors.dense([1.0, 2.0]) 271 >>> u = Vectors.dense([3.0, 4.0]) 272 >>> v + u 273 DenseVector([4.0, 6.0]) 274 >>> 2 - v 275 DenseVector([1.0, 0.0]) 276 >>> v / 2 277 DenseVector([0.5, 1.0]) 278 >>> v * u 279 DenseVector([3.0, 8.0]) 280 >>> u / v 281 DenseVector([3.0, 2.0]) 282 >>> u % 2 283 DenseVector([1.0, 0.0]) 284 """ 285 def __init__(self, ar): 286 if isinstance(ar, bytes): 287 ar = np.frombuffer(ar, dtype=np.float64) 288 elif not isinstance(ar, np.ndarray): 289 ar = np.array(ar, dtype=np.float64) 290 if ar.dtype != np.float64: 291 ar = ar.astype(np.float64) 292 self.array = ar 293 294 @staticmethod 295 def parse(s): 296 """ 297 Parse string representation back into the DenseVector. 298 299 >>> DenseVector.parse(' [ 0.0,1.0,2.0, 3.0]') 300 DenseVector([0.0, 1.0, 2.0, 3.0]) 301 """ 302 start = s.find('[') 303 if start == -1: 304 raise ValueError("Array should start with '['.") 305 end = s.find(']') 306 if end == -1: 307 raise ValueError("Array should end with ']'.") 308 s = s[start + 1: end] 309 310 try: 311 values = [float(val) for val in s.split(',') if val] 312 except ValueError: 313 raise ValueError("Unable to parse values from %s" % s) 314 return DenseVector(values) 315 316 def __reduce__(self): 317 return DenseVector, (self.array.tostring(),) 318 319 def numNonzeros(self): 320 """ 321 Number of nonzero elements. This scans all active values and count non zeros 322 """ 323 return np.count_nonzero(self.array) 324 325 def norm(self, p): 326 """ 327 Calculates the norm of a DenseVector. 328 329 >>> a = DenseVector([0, -1, 2, -3]) 330 >>> a.norm(2) 331 3.7... 332 >>> a.norm(1) 333 6.0 334 """ 335 return np.linalg.norm(self.array, p) 336 337 def dot(self, other): 338 """ 339 Compute the dot product of two Vectors. We support 340 (Numpy array, list, SparseVector, or SciPy sparse) 341 and a target NumPy array that is either 1- or 2-dimensional. 342 Equivalent to calling numpy.dot of the two vectors. 343 344 >>> dense = DenseVector(array.array('d', [1., 2.])) 345 >>> dense.dot(dense) 346 5.0 347 >>> dense.dot(SparseVector(2, [0, 1], [2., 1.])) 348 4.0 349 >>> dense.dot(range(1, 3)) 350 5.0 351 >>> dense.dot(np.array(range(1, 3))) 352 5.0 353 >>> dense.dot([1.,]) 354 Traceback (most recent call last): 355 ... 356 AssertionError: dimension mismatch 357 >>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F')) 358 array([ 5., 11.]) 359 >>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F')) 360 Traceback (most recent call last): 361 ... 362 AssertionError: dimension mismatch 363 """ 364 if type(other) == np.ndarray: 365 if other.ndim > 1: 366 assert len(self) == other.shape[0], "dimension mismatch" 367 return np.dot(self.array, other) 368 elif _have_scipy and scipy.sparse.issparse(other): 369 assert len(self) == other.shape[0], "dimension mismatch" 370 return other.transpose().dot(self.toArray()) 371 else: 372 assert len(self) == _vector_size(other), "dimension mismatch" 373 if isinstance(other, SparseVector): 374 return other.dot(self) 375 elif isinstance(other, Vector): 376 return np.dot(self.toArray(), other.toArray()) 377 else: 378 return np.dot(self.toArray(), other) 379 380 def squared_distance(self, other): 381 """ 382 Squared distance of two Vectors. 383 384 >>> dense1 = DenseVector(array.array('d', [1., 2.])) 385 >>> dense1.squared_distance(dense1) 386 0.0 387 >>> dense2 = np.array([2., 1.]) 388 >>> dense1.squared_distance(dense2) 389 2.0 390 >>> dense3 = [2., 1.] 391 >>> dense1.squared_distance(dense3) 392 2.0 393 >>> sparse1 = SparseVector(2, [0, 1], [2., 1.]) 394 >>> dense1.squared_distance(sparse1) 395 2.0 396 >>> dense1.squared_distance([1.,]) 397 Traceback (most recent call last): 398 ... 399 AssertionError: dimension mismatch 400 >>> dense1.squared_distance(SparseVector(1, [0,], [1.,])) 401 Traceback (most recent call last): 402 ... 403 AssertionError: dimension mismatch 404 """ 405 assert len(self) == _vector_size(other), "dimension mismatch" 406 if isinstance(other, SparseVector): 407 return other.squared_distance(self) 408 elif _have_scipy and scipy.sparse.issparse(other): 409 return _convert_to_vector(other).squared_distance(self) 410 411 if isinstance(other, Vector): 412 other = other.toArray() 413 elif not isinstance(other, np.ndarray): 414 other = np.array(other) 415 diff = self.toArray() - other 416 return np.dot(diff, diff) 417 418 def toArray(self): 419 """ 420 Returns an numpy.ndarray 421 """ 422 return self.array 423 424 def asML(self): 425 """ 426 Convert this vector to the new mllib-local representation. 427 This does NOT copy the data; it copies references. 428 429 :return: :py:class:`pyspark.ml.linalg.DenseVector` 430 431 .. versionadded:: 2.0.0 432 """ 433 return newlinalg.DenseVector(self.array) 434 435 @property 436 def values(self): 437 """ 438 Returns a list of values 439 """ 440 return self.array 441 442 def __getitem__(self, item): 443 return self.array[item] 444 445 def __len__(self): 446 return len(self.array) 447 448 def __str__(self): 449 return "[" + ",".join([str(v) for v in self.array]) + "]" 450 451 def __repr__(self): 452 return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array)) 453 454 def __eq__(self, other): 455 if isinstance(other, DenseVector): 456 return np.array_equal(self.array, other.array) 457 elif isinstance(other, SparseVector): 458 if len(self) != other.size: 459 return False 460 return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values) 461 return False 462 463 def __ne__(self, other): 464 return not self == other 465 466 def __hash__(self): 467 size = len(self) 468 result = 31 + size 469 nnz = 0 470 i = 0 471 while i < size and nnz < 128: 472 if self.array[i] != 0: 473 result = 31 * result + i 474 bits = _double_to_long_bits(self.array[i]) 475 result = 31 * result + (bits ^ (bits >> 32)) 476 nnz += 1 477 i += 1 478 return result 479 480 def __getattr__(self, item): 481 return getattr(self.array, item) 482 483 def _delegate(op): 484 def func(self, other): 485 if isinstance(other, DenseVector): 486 other = other.array 487 return DenseVector(getattr(self.array, op)(other)) 488 return func 489 490 __neg__ = _delegate("__neg__") 491 __add__ = _delegate("__add__") 492 __sub__ = _delegate("__sub__") 493 __mul__ = _delegate("__mul__") 494 __div__ = _delegate("__div__") 495 __truediv__ = _delegate("__truediv__") 496 __mod__ = _delegate("__mod__") 497 __radd__ = _delegate("__radd__") 498 __rsub__ = _delegate("__rsub__") 499 __rmul__ = _delegate("__rmul__") 500 __rdiv__ = _delegate("__rdiv__") 501 __rtruediv__ = _delegate("__rtruediv__") 502 __rmod__ = _delegate("__rmod__") 503 504 505class SparseVector(Vector): 506 """ 507 A simple sparse vector class for passing data to MLlib. Users may 508 alternatively pass SciPy's {scipy.sparse} data types. 509 """ 510 def __init__(self, size, *args): 511 """ 512 Create a sparse vector, using either a dictionary, a list of 513 (index, value) pairs, or two separate arrays of indices and 514 values (sorted by index). 515 516 :param size: Size of the vector. 517 :param args: Active entries, as a dictionary {index: value, ...}, 518 a list of tuples [(index, value), ...], or a list of strictly 519 increasing indices and a list of corresponding values [index, ...], 520 [value, ...]. Inactive entries are treated as zeros. 521 522 >>> SparseVector(4, {1: 1.0, 3: 5.5}) 523 SparseVector(4, {1: 1.0, 3: 5.5}) 524 >>> SparseVector(4, [(1, 1.0), (3, 5.5)]) 525 SparseVector(4, {1: 1.0, 3: 5.5}) 526 >>> SparseVector(4, [1, 3], [1.0, 5.5]) 527 SparseVector(4, {1: 1.0, 3: 5.5}) 528 """ 529 self.size = int(size) 530 """ Size of the vector. """ 531 assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments" 532 if len(args) == 1: 533 pairs = args[0] 534 if type(pairs) == dict: 535 pairs = pairs.items() 536 pairs = sorted(pairs) 537 self.indices = np.array([p[0] for p in pairs], dtype=np.int32) 538 """ A list of indices corresponding to active entries. """ 539 self.values = np.array([p[1] for p in pairs], dtype=np.float64) 540 """ A list of values corresponding to active entries. """ 541 else: 542 if isinstance(args[0], bytes): 543 assert isinstance(args[1], bytes), "values should be string too" 544 if args[0]: 545 self.indices = np.frombuffer(args[0], np.int32) 546 self.values = np.frombuffer(args[1], np.float64) 547 else: 548 # np.frombuffer() doesn't work well with empty string in older version 549 self.indices = np.array([], dtype=np.int32) 550 self.values = np.array([], dtype=np.float64) 551 else: 552 self.indices = np.array(args[0], dtype=np.int32) 553 self.values = np.array(args[1], dtype=np.float64) 554 assert len(self.indices) == len(self.values), "index and value arrays not same length" 555 for i in xrange(len(self.indices) - 1): 556 if self.indices[i] >= self.indices[i + 1]: 557 raise TypeError( 558 "Indices %s and %s are not strictly increasing" 559 % (self.indices[i], self.indices[i + 1])) 560 561 def numNonzeros(self): 562 """ 563 Number of nonzero elements. This scans all active values and count non zeros. 564 """ 565 return np.count_nonzero(self.values) 566 567 def norm(self, p): 568 """ 569 Calculates the norm of a SparseVector. 570 571 >>> a = SparseVector(4, [0, 1], [3., -4.]) 572 >>> a.norm(1) 573 7.0 574 >>> a.norm(2) 575 5.0 576 """ 577 return np.linalg.norm(self.values, p) 578 579 def __reduce__(self): 580 return ( 581 SparseVector, 582 (self.size, self.indices.tostring(), self.values.tostring())) 583 584 @staticmethod 585 def parse(s): 586 """ 587 Parse string representation back into the SparseVector. 588 589 >>> SparseVector.parse(' (4, [0,1 ],[ 4.0,5.0] )') 590 SparseVector(4, {0: 4.0, 1: 5.0}) 591 """ 592 start = s.find('(') 593 if start == -1: 594 raise ValueError("Tuple should start with '('") 595 end = s.find(')') 596 if end == -1: 597 raise ValueError("Tuple should end with ')'") 598 s = s[start + 1: end].strip() 599 600 size = s[: s.find(',')] 601 try: 602 size = int(size) 603 except ValueError: 604 raise ValueError("Cannot parse size %s." % size) 605 606 ind_start = s.find('[') 607 if ind_start == -1: 608 raise ValueError("Indices array should start with '['.") 609 ind_end = s.find(']') 610 if ind_end == -1: 611 raise ValueError("Indices array should end with ']'") 612 new_s = s[ind_start + 1: ind_end] 613 ind_list = new_s.split(',') 614 try: 615 indices = [int(ind) for ind in ind_list if ind] 616 except ValueError: 617 raise ValueError("Unable to parse indices from %s." % new_s) 618 s = s[ind_end + 1:].strip() 619 620 val_start = s.find('[') 621 if val_start == -1: 622 raise ValueError("Values array should start with '['.") 623 val_end = s.find(']') 624 if val_end == -1: 625 raise ValueError("Values array should end with ']'.") 626 val_list = s[val_start + 1: val_end].split(',') 627 try: 628 values = [float(val) for val in val_list if val] 629 except ValueError: 630 raise ValueError("Unable to parse values from %s." % s) 631 return SparseVector(size, indices, values) 632 633 def dot(self, other): 634 """ 635 Dot product with a SparseVector or 1- or 2-dimensional Numpy array. 636 637 >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) 638 >>> a.dot(a) 639 25.0 640 >>> a.dot(array.array('d', [1., 2., 3., 4.])) 641 22.0 642 >>> b = SparseVector(4, [2], [1.0]) 643 >>> a.dot(b) 644 0.0 645 >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]])) 646 array([ 22., 22.]) 647 >>> a.dot([1., 2., 3.]) 648 Traceback (most recent call last): 649 ... 650 AssertionError: dimension mismatch 651 >>> a.dot(np.array([1., 2.])) 652 Traceback (most recent call last): 653 ... 654 AssertionError: dimension mismatch 655 >>> a.dot(DenseVector([1., 2.])) 656 Traceback (most recent call last): 657 ... 658 AssertionError: dimension mismatch 659 >>> a.dot(np.zeros((3, 2))) 660 Traceback (most recent call last): 661 ... 662 AssertionError: dimension mismatch 663 """ 664 665 if isinstance(other, np.ndarray): 666 if other.ndim not in [2, 1]: 667 raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim) 668 assert len(self) == other.shape[0], "dimension mismatch" 669 return np.dot(self.values, other[self.indices]) 670 671 assert len(self) == _vector_size(other), "dimension mismatch" 672 673 if isinstance(other, DenseVector): 674 return np.dot(other.array[self.indices], self.values) 675 676 elif isinstance(other, SparseVector): 677 # Find out common indices. 678 self_cmind = np.in1d(self.indices, other.indices, assume_unique=True) 679 self_values = self.values[self_cmind] 680 if self_values.size == 0: 681 return 0.0 682 else: 683 other_cmind = np.in1d(other.indices, self.indices, assume_unique=True) 684 return np.dot(self_values, other.values[other_cmind]) 685 686 else: 687 return self.dot(_convert_to_vector(other)) 688 689 def squared_distance(self, other): 690 """ 691 Squared distance from a SparseVector or 1-dimensional NumPy array. 692 693 >>> a = SparseVector(4, [1, 3], [3.0, 4.0]) 694 >>> a.squared_distance(a) 695 0.0 696 >>> a.squared_distance(array.array('d', [1., 2., 3., 4.])) 697 11.0 698 >>> a.squared_distance(np.array([1., 2., 3., 4.])) 699 11.0 700 >>> b = SparseVector(4, [2], [1.0]) 701 >>> a.squared_distance(b) 702 26.0 703 >>> b.squared_distance(a) 704 26.0 705 >>> b.squared_distance([1., 2.]) 706 Traceback (most recent call last): 707 ... 708 AssertionError: dimension mismatch 709 >>> b.squared_distance(SparseVector(3, [1,], [1.0,])) 710 Traceback (most recent call last): 711 ... 712 AssertionError: dimension mismatch 713 """ 714 assert len(self) == _vector_size(other), "dimension mismatch" 715 716 if isinstance(other, np.ndarray) or isinstance(other, DenseVector): 717 if isinstance(other, np.ndarray) and other.ndim != 1: 718 raise Exception("Cannot call squared_distance with %d-dimensional array" % 719 other.ndim) 720 if isinstance(other, DenseVector): 721 other = other.array 722 sparse_ind = np.zeros(other.size, dtype=bool) 723 sparse_ind[self.indices] = True 724 dist = other[sparse_ind] - self.values 725 result = np.dot(dist, dist) 726 727 other_ind = other[~sparse_ind] 728 result += np.dot(other_ind, other_ind) 729 return result 730 731 elif isinstance(other, SparseVector): 732 result = 0.0 733 i, j = 0, 0 734 while i < len(self.indices) and j < len(other.indices): 735 if self.indices[i] == other.indices[j]: 736 diff = self.values[i] - other.values[j] 737 result += diff * diff 738 i += 1 739 j += 1 740 elif self.indices[i] < other.indices[j]: 741 result += self.values[i] * self.values[i] 742 i += 1 743 else: 744 result += other.values[j] * other.values[j] 745 j += 1 746 while i < len(self.indices): 747 result += self.values[i] * self.values[i] 748 i += 1 749 while j < len(other.indices): 750 result += other.values[j] * other.values[j] 751 j += 1 752 return result 753 else: 754 return self.squared_distance(_convert_to_vector(other)) 755 756 def toArray(self): 757 """ 758 Returns a copy of this SparseVector as a 1-dimensional NumPy array. 759 """ 760 arr = np.zeros((self.size,), dtype=np.float64) 761 arr[self.indices] = self.values 762 return arr 763 764 def asML(self): 765 """ 766 Convert this vector to the new mllib-local representation. 767 This does NOT copy the data; it copies references. 768 769 :return: :py:class:`pyspark.ml.linalg.SparseVector` 770 771 .. versionadded:: 2.0.0 772 """ 773 return newlinalg.SparseVector(self.size, self.indices, self.values) 774 775 def __len__(self): 776 return self.size 777 778 def __str__(self): 779 inds = "[" + ",".join([str(i) for i in self.indices]) + "]" 780 vals = "[" + ",".join([str(v) for v in self.values]) + "]" 781 return "(" + ",".join((str(self.size), inds, vals)) + ")" 782 783 def __repr__(self): 784 inds = self.indices 785 vals = self.values 786 entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i])) 787 for i in xrange(len(inds))]) 788 return "SparseVector({0}, {{{1}}})".format(self.size, entries) 789 790 def __eq__(self, other): 791 if isinstance(other, SparseVector): 792 return other.size == self.size and np.array_equal(other.indices, self.indices) \ 793 and np.array_equal(other.values, self.values) 794 elif isinstance(other, DenseVector): 795 if self.size != len(other): 796 return False 797 return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array) 798 return False 799 800 def __getitem__(self, index): 801 inds = self.indices 802 vals = self.values 803 if not isinstance(index, int): 804 raise TypeError( 805 "Indices must be of type integer, got type %s" % type(index)) 806 807 if index >= self.size or index < -self.size: 808 raise IndexError("Index %d out of bounds." % index) 809 if index < 0: 810 index += self.size 811 812 if (inds.size == 0) or (index > inds.item(-1)): 813 return 0. 814 815 insert_index = np.searchsorted(inds, index) 816 row_ind = inds[insert_index] 817 if row_ind == index: 818 return vals[insert_index] 819 return 0. 820 821 def __ne__(self, other): 822 return not self.__eq__(other) 823 824 def __hash__(self): 825 result = 31 + self.size 826 nnz = 0 827 i = 0 828 while i < len(self.values) and nnz < 128: 829 if self.values[i] != 0: 830 result = 31 * result + int(self.indices[i]) 831 bits = _double_to_long_bits(self.values[i]) 832 result = 31 * result + (bits ^ (bits >> 32)) 833 nnz += 1 834 i += 1 835 return result 836 837 838class Vectors(object): 839 840 """ 841 Factory methods for working with vectors. 842 843 .. note:: Dense vectors are simply represented as NumPy array objects, 844 so there is no need to covert them for use in MLlib. For sparse vectors, 845 the factory methods in this class create an MLlib-compatible type, or users 846 can pass in SciPy's C{scipy.sparse} column vectors. 847 """ 848 849 @staticmethod 850 def sparse(size, *args): 851 """ 852 Create a sparse vector, using either a dictionary, a list of 853 (index, value) pairs, or two separate arrays of indices and 854 values (sorted by index). 855 856 :param size: Size of the vector. 857 :param args: Non-zero entries, as a dictionary, list of tuples, 858 or two sorted lists containing indices and values. 859 860 >>> Vectors.sparse(4, {1: 1.0, 3: 5.5}) 861 SparseVector(4, {1: 1.0, 3: 5.5}) 862 >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)]) 863 SparseVector(4, {1: 1.0, 3: 5.5}) 864 >>> Vectors.sparse(4, [1, 3], [1.0, 5.5]) 865 SparseVector(4, {1: 1.0, 3: 5.5}) 866 """ 867 return SparseVector(size, *args) 868 869 @staticmethod 870 def dense(*elements): 871 """ 872 Create a dense vector of 64-bit floats from a Python list or numbers. 873 874 >>> Vectors.dense([1, 2, 3]) 875 DenseVector([1.0, 2.0, 3.0]) 876 >>> Vectors.dense(1.0, 2.0) 877 DenseVector([1.0, 2.0]) 878 """ 879 if len(elements) == 1 and not isinstance(elements[0], (float, int, long)): 880 # it's list, numpy.array or other iterable object. 881 elements = elements[0] 882 return DenseVector(elements) 883 884 @staticmethod 885 def fromML(vec): 886 """ 887 Convert a vector from the new mllib-local representation. 888 This does NOT copy the data; it copies references. 889 890 :param vec: a :py:class:`pyspark.ml.linalg.Vector` 891 :return: a :py:class:`pyspark.mllib.linalg.Vector` 892 893 .. versionadded:: 2.0.0 894 """ 895 if isinstance(vec, newlinalg.DenseVector): 896 return DenseVector(vec.array) 897 elif isinstance(vec, newlinalg.SparseVector): 898 return SparseVector(vec.size, vec.indices, vec.values) 899 else: 900 raise TypeError("Unsupported vector type %s" % type(vec)) 901 902 @staticmethod 903 def stringify(vector): 904 """ 905 Converts a vector into a string, which can be recognized by 906 Vectors.parse(). 907 908 >>> Vectors.stringify(Vectors.sparse(2, [1], [1.0])) 909 '(2,[1],[1.0])' 910 >>> Vectors.stringify(Vectors.dense([0.0, 1.0])) 911 '[0.0,1.0]' 912 """ 913 return str(vector) 914 915 @staticmethod 916 def squared_distance(v1, v2): 917 """ 918 Squared distance between two vectors. 919 a and b can be of type SparseVector, DenseVector, np.ndarray 920 or array.array. 921 922 >>> a = Vectors.sparse(4, [(0, 1), (3, 4)]) 923 >>> b = Vectors.dense([2, 5, 4, 1]) 924 >>> a.squared_distance(b) 925 51.0 926 """ 927 v1, v2 = _convert_to_vector(v1), _convert_to_vector(v2) 928 return v1.squared_distance(v2) 929 930 @staticmethod 931 def norm(vector, p): 932 """ 933 Find norm of the given vector. 934 """ 935 return _convert_to_vector(vector).norm(p) 936 937 @staticmethod 938 def parse(s): 939 """Parse a string representation back into the Vector. 940 941 >>> Vectors.parse('[2,1,2 ]') 942 DenseVector([2.0, 1.0, 2.0]) 943 >>> Vectors.parse(' ( 100, [0], [2])') 944 SparseVector(100, {0: 2.0}) 945 """ 946 if s.find('(') == -1 and s.find('[') != -1: 947 return DenseVector.parse(s) 948 elif s.find('(') != -1: 949 return SparseVector.parse(s) 950 else: 951 raise ValueError( 952 "Cannot find tokens '[' or '(' from the input string.") 953 954 @staticmethod 955 def zeros(size): 956 return DenseVector(np.zeros(size)) 957 958 @staticmethod 959 def _equals(v1_indices, v1_values, v2_indices, v2_values): 960 """ 961 Check equality between sparse/dense vectors, 962 v1_indices and v2_indices assume to be strictly increasing. 963 """ 964 v1_size = len(v1_values) 965 v2_size = len(v2_values) 966 k1 = 0 967 k2 = 0 968 all_equal = True 969 while all_equal: 970 while k1 < v1_size and v1_values[k1] == 0: 971 k1 += 1 972 while k2 < v2_size and v2_values[k2] == 0: 973 k2 += 1 974 975 if k1 >= v1_size or k2 >= v2_size: 976 return k1 >= v1_size and k2 >= v2_size 977 978 all_equal = v1_indices[k1] == v2_indices[k2] and v1_values[k1] == v2_values[k2] 979 k1 += 1 980 k2 += 1 981 return all_equal 982 983 984class Matrix(object): 985 986 __UDT__ = MatrixUDT() 987 988 """ 989 Represents a local matrix. 990 """ 991 def __init__(self, numRows, numCols, isTransposed=False): 992 self.numRows = numRows 993 self.numCols = numCols 994 self.isTransposed = isTransposed 995 996 def toArray(self): 997 """ 998 Returns its elements in a NumPy ndarray. 999 """ 1000 raise NotImplementedError 1001 1002 def asML(self): 1003 """ 1004 Convert this matrix to the new mllib-local representation. 1005 This does NOT copy the data; it copies references. 1006 """ 1007 raise NotImplementedError 1008 1009 @staticmethod 1010 def _convert_to_array(array_like, dtype): 1011 """ 1012 Convert Matrix attributes which are array-like or buffer to array. 1013 """ 1014 if isinstance(array_like, bytes): 1015 return np.frombuffer(array_like, dtype=dtype) 1016 return np.asarray(array_like, dtype=dtype) 1017 1018 1019class DenseMatrix(Matrix): 1020 """ 1021 Column-major dense matrix. 1022 """ 1023 def __init__(self, numRows, numCols, values, isTransposed=False): 1024 Matrix.__init__(self, numRows, numCols, isTransposed) 1025 values = self._convert_to_array(values, np.float64) 1026 assert len(values) == numRows * numCols 1027 self.values = values 1028 1029 def __reduce__(self): 1030 return DenseMatrix, ( 1031 self.numRows, self.numCols, self.values.tostring(), 1032 int(self.isTransposed)) 1033 1034 def __str__(self): 1035 """ 1036 Pretty printing of a DenseMatrix 1037 1038 >>> dm = DenseMatrix(2, 2, range(4)) 1039 >>> print(dm) 1040 DenseMatrix([[ 0., 2.], 1041 [ 1., 3.]]) 1042 >>> dm = DenseMatrix(2, 2, range(4), isTransposed=True) 1043 >>> print(dm) 1044 DenseMatrix([[ 0., 1.], 1045 [ 2., 3.]]) 1046 """ 1047 # Inspired by __repr__ in scipy matrices. 1048 array_lines = repr(self.toArray()).splitlines() 1049 1050 # We need to adjust six spaces which is the difference in number 1051 # of letters between "DenseMatrix" and "array" 1052 x = '\n'.join([(" " * 6 + line) for line in array_lines[1:]]) 1053 return array_lines[0].replace("array", "DenseMatrix") + "\n" + x 1054 1055 def __repr__(self): 1056 """ 1057 Representation of a DenseMatrix 1058 1059 >>> dm = DenseMatrix(2, 2, range(4)) 1060 >>> dm 1061 DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False) 1062 """ 1063 # If the number of values are less than seventeen then return as it is. 1064 # Else return first eight values and last eight values. 1065 if len(self.values) < 17: 1066 entries = _format_float_list(self.values) 1067 else: 1068 entries = ( 1069 _format_float_list(self.values[:8]) + 1070 ["..."] + 1071 _format_float_list(self.values[-8:]) 1072 ) 1073 1074 entries = ", ".join(entries) 1075 return "DenseMatrix({0}, {1}, [{2}], {3})".format( 1076 self.numRows, self.numCols, entries, self.isTransposed) 1077 1078 def toArray(self): 1079 """ 1080 Return an numpy.ndarray 1081 1082 >>> m = DenseMatrix(2, 2, range(4)) 1083 >>> m.toArray() 1084 array([[ 0., 2.], 1085 [ 1., 3.]]) 1086 """ 1087 if self.isTransposed: 1088 return np.asfortranarray( 1089 self.values.reshape((self.numRows, self.numCols))) 1090 else: 1091 return self.values.reshape((self.numRows, self.numCols), order='F') 1092 1093 def toSparse(self): 1094 """Convert to SparseMatrix""" 1095 if self.isTransposed: 1096 values = np.ravel(self.toArray(), order='F') 1097 else: 1098 values = self.values 1099 indices = np.nonzero(values)[0] 1100 colCounts = np.bincount(indices // self.numRows) 1101 colPtrs = np.cumsum(np.hstack( 1102 (0, colCounts, np.zeros(self.numCols - colCounts.size)))) 1103 values = values[indices] 1104 rowIndices = indices % self.numRows 1105 1106 return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values) 1107 1108 def asML(self): 1109 """ 1110 Convert this matrix to the new mllib-local representation. 1111 This does NOT copy the data; it copies references. 1112 1113 :return: :py:class:`pyspark.ml.linalg.DenseMatrix` 1114 1115 .. versionadded:: 2.0.0 1116 """ 1117 return newlinalg.DenseMatrix(self.numRows, self.numCols, self.values, self.isTransposed) 1118 1119 def __getitem__(self, indices): 1120 i, j = indices 1121 if i < 0 or i >= self.numRows: 1122 raise IndexError("Row index %d is out of range [0, %d)" 1123 % (i, self.numRows)) 1124 if j >= self.numCols or j < 0: 1125 raise IndexError("Column index %d is out of range [0, %d)" 1126 % (j, self.numCols)) 1127 1128 if self.isTransposed: 1129 return self.values[i * self.numCols + j] 1130 else: 1131 return self.values[i + j * self.numRows] 1132 1133 def __eq__(self, other): 1134 if (not isinstance(other, DenseMatrix) or 1135 self.numRows != other.numRows or 1136 self.numCols != other.numCols): 1137 return False 1138 1139 self_values = np.ravel(self.toArray(), order='F') 1140 other_values = np.ravel(other.toArray(), order='F') 1141 return all(self_values == other_values) 1142 1143 1144class SparseMatrix(Matrix): 1145 """Sparse Matrix stored in CSC format.""" 1146 def __init__(self, numRows, numCols, colPtrs, rowIndices, values, 1147 isTransposed=False): 1148 Matrix.__init__(self, numRows, numCols, isTransposed) 1149 self.colPtrs = self._convert_to_array(colPtrs, np.int32) 1150 self.rowIndices = self._convert_to_array(rowIndices, np.int32) 1151 self.values = self._convert_to_array(values, np.float64) 1152 1153 if self.isTransposed: 1154 if self.colPtrs.size != numRows + 1: 1155 raise ValueError("Expected colPtrs of size %d, got %d." 1156 % (numRows + 1, self.colPtrs.size)) 1157 else: 1158 if self.colPtrs.size != numCols + 1: 1159 raise ValueError("Expected colPtrs of size %d, got %d." 1160 % (numCols + 1, self.colPtrs.size)) 1161 if self.rowIndices.size != self.values.size: 1162 raise ValueError("Expected rowIndices of length %d, got %d." 1163 % (self.rowIndices.size, self.values.size)) 1164 1165 def __str__(self): 1166 """ 1167 Pretty printing of a SparseMatrix 1168 1169 >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) 1170 >>> print(sm1) 1171 2 X 2 CSCMatrix 1172 (0,0) 2.0 1173 (1,0) 3.0 1174 (1,1) 4.0 1175 >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) 1176 >>> print(sm1) 1177 2 X 2 CSRMatrix 1178 (0,0) 2.0 1179 (0,1) 3.0 1180 (1,1) 4.0 1181 """ 1182 spstr = "{0} X {1} ".format(self.numRows, self.numCols) 1183 if self.isTransposed: 1184 spstr += "CSRMatrix\n" 1185 else: 1186 spstr += "CSCMatrix\n" 1187 1188 cur_col = 0 1189 smlist = [] 1190 1191 # Display first 16 values. 1192 if len(self.values) <= 16: 1193 zipindval = zip(self.rowIndices, self.values) 1194 else: 1195 zipindval = zip(self.rowIndices[:16], self.values[:16]) 1196 for i, (rowInd, value) in enumerate(zipindval): 1197 if self.colPtrs[cur_col + 1] <= i: 1198 cur_col += 1 1199 if self.isTransposed: 1200 smlist.append('({0},{1}) {2}'.format( 1201 cur_col, rowInd, _format_float(value))) 1202 else: 1203 smlist.append('({0},{1}) {2}'.format( 1204 rowInd, cur_col, _format_float(value))) 1205 spstr += "\n".join(smlist) 1206 1207 if len(self.values) > 16: 1208 spstr += "\n.." * 2 1209 return spstr 1210 1211 def __repr__(self): 1212 """ 1213 Representation of a SparseMatrix 1214 1215 >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) 1216 >>> sm1 1217 SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0], False) 1218 """ 1219 rowIndices = list(self.rowIndices) 1220 colPtrs = list(self.colPtrs) 1221 1222 if len(self.values) <= 16: 1223 values = _format_float_list(self.values) 1224 1225 else: 1226 values = ( 1227 _format_float_list(self.values[:8]) + 1228 ["..."] + 1229 _format_float_list(self.values[-8:]) 1230 ) 1231 rowIndices = rowIndices[:8] + ["..."] + rowIndices[-8:] 1232 1233 if len(self.colPtrs) > 16: 1234 colPtrs = colPtrs[:8] + ["..."] + colPtrs[-8:] 1235 1236 values = ", ".join(values) 1237 rowIndices = ", ".join([str(ind) for ind in rowIndices]) 1238 colPtrs = ", ".join([str(ptr) for ptr in colPtrs]) 1239 return "SparseMatrix({0}, {1}, [{2}], [{3}], [{4}], {5})".format( 1240 self.numRows, self.numCols, colPtrs, rowIndices, 1241 values, self.isTransposed) 1242 1243 def __reduce__(self): 1244 return SparseMatrix, ( 1245 self.numRows, self.numCols, self.colPtrs.tostring(), 1246 self.rowIndices.tostring(), self.values.tostring(), 1247 int(self.isTransposed)) 1248 1249 def __getitem__(self, indices): 1250 i, j = indices 1251 if i < 0 or i >= self.numRows: 1252 raise IndexError("Row index %d is out of range [0, %d)" 1253 % (i, self.numRows)) 1254 if j < 0 or j >= self.numCols: 1255 raise IndexError("Column index %d is out of range [0, %d)" 1256 % (j, self.numCols)) 1257 1258 # If a CSR matrix is given, then the row index should be searched 1259 # for in ColPtrs, and the column index should be searched for in the 1260 # corresponding slice obtained from rowIndices. 1261 if self.isTransposed: 1262 j, i = i, j 1263 1264 colStart = self.colPtrs[j] 1265 colEnd = self.colPtrs[j + 1] 1266 nz = self.rowIndices[colStart: colEnd] 1267 ind = np.searchsorted(nz, i) + colStart 1268 if ind < colEnd and self.rowIndices[ind] == i: 1269 return self.values[ind] 1270 else: 1271 return 0.0 1272 1273 def toArray(self): 1274 """ 1275 Return an numpy.ndarray 1276 """ 1277 A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F') 1278 for k in xrange(self.colPtrs.size - 1): 1279 startptr = self.colPtrs[k] 1280 endptr = self.colPtrs[k + 1] 1281 if self.isTransposed: 1282 A[k, self.rowIndices[startptr:endptr]] = self.values[startptr:endptr] 1283 else: 1284 A[self.rowIndices[startptr:endptr], k] = self.values[startptr:endptr] 1285 return A 1286 1287 def toDense(self): 1288 densevals = np.ravel(self.toArray(), order='F') 1289 return DenseMatrix(self.numRows, self.numCols, densevals) 1290 1291 def asML(self): 1292 """ 1293 Convert this matrix to the new mllib-local representation. 1294 This does NOT copy the data; it copies references. 1295 1296 :return: :py:class:`pyspark.ml.linalg.SparseMatrix` 1297 1298 .. versionadded:: 2.0.0 1299 """ 1300 return newlinalg.SparseMatrix(self.numRows, self.numCols, self.colPtrs, self.rowIndices, 1301 self.values, self.isTransposed) 1302 1303 # TODO: More efficient implementation: 1304 def __eq__(self, other): 1305 return np.all(self.toArray() == other.toArray()) 1306 1307 1308class Matrices(object): 1309 @staticmethod 1310 def dense(numRows, numCols, values): 1311 """ 1312 Create a DenseMatrix 1313 """ 1314 return DenseMatrix(numRows, numCols, values) 1315 1316 @staticmethod 1317 def sparse(numRows, numCols, colPtrs, rowIndices, values): 1318 """ 1319 Create a SparseMatrix 1320 """ 1321 return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values) 1322 1323 @staticmethod 1324 def fromML(mat): 1325 """ 1326 Convert a matrix from the new mllib-local representation. 1327 This does NOT copy the data; it copies references. 1328 1329 :param mat: a :py:class:`pyspark.ml.linalg.Matrix` 1330 :return: a :py:class:`pyspark.mllib.linalg.Matrix` 1331 1332 .. versionadded:: 2.0.0 1333 """ 1334 if isinstance(mat, newlinalg.DenseMatrix): 1335 return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed) 1336 elif isinstance(mat, newlinalg.SparseMatrix): 1337 return SparseMatrix(mat.numRows, mat.numCols, mat.colPtrs, mat.rowIndices, 1338 mat.values, mat.isTransposed) 1339 else: 1340 raise TypeError("Unsupported matrix type %s" % type(mat)) 1341 1342 1343class QRDecomposition(object): 1344 """ 1345 Represents QR factors. 1346 """ 1347 def __init__(self, Q, R): 1348 self._Q = Q 1349 self._R = R 1350 1351 @property 1352 @since('2.0.0') 1353 def Q(self): 1354 """ 1355 An orthogonal matrix Q in a QR decomposition. 1356 May be null if not computed. 1357 """ 1358 return self._Q 1359 1360 @property 1361 @since('2.0.0') 1362 def R(self): 1363 """ 1364 An upper triangular matrix R in a QR decomposition. 1365 """ 1366 return self._R 1367 1368 1369def _test(): 1370 import doctest 1371 (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS) 1372 if failure_count: 1373 exit(-1) 1374 1375if __name__ == "__main__": 1376 _test() 1377