1#
2# Licensed to the Apache Software Foundation (ASF) under one or more
3# contributor license agreements.  See the NOTICE file distributed with
4# this work for additional information regarding copyright ownership.
5# The ASF licenses this file to You under the Apache License, Version 2.0
6# (the "License"); you may not use this file except in compliance with
7# the License.  You may obtain a copy of the License at
8#
9#    http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17
18"""
19MLlib utilities for linear algebra. For dense vectors, MLlib
20uses the NumPy C{array} type, so you can simply pass NumPy arrays
21around. For sparse vectors, users can construct a L{SparseVector}
22object from MLlib or pass SciPy C{scipy.sparse} column vectors if
23SciPy is available in their environment.
24"""
25
26import sys
27import array
28import struct
29
30if sys.version >= '3':
31    basestring = str
32    xrange = range
33    import copyreg as copy_reg
34    long = int
35else:
36    from itertools import izip as zip
37    import copy_reg
38
39import numpy as np
40
41from pyspark import since
42from pyspark.ml import linalg as newlinalg
43from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
44    IntegerType, ByteType, BooleanType
45
46
47__all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors',
48           'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices',
49           'QRDecomposition']
50
51
52if sys.version_info[:2] == (2, 7):
53    # speed up pickling array in Python 2.7
54    def fast_pickle_array(ar):
55        return array.array, (ar.typecode, ar.tostring())
56    copy_reg.pickle(array.array, fast_pickle_array)
57
58
59# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods,
60# such as _dot and _serialize_double_vector, start to support scipy.sparse matrices.
61
62try:
63    import scipy.sparse
64    _have_scipy = True
65except:
66    # No SciPy in environment, but that's okay
67    _have_scipy = False
68
69
70def _convert_to_vector(l):
71    if isinstance(l, Vector):
72        return l
73    elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange):
74        return DenseVector(l)
75    elif _have_scipy and scipy.sparse.issparse(l):
76        assert l.shape[1] == 1, "Expected column vector"
77        # Make sure the converted csc_matrix has sorted indices.
78        csc = l.tocsc()
79        if not csc.has_sorted_indices:
80            csc.sort_indices()
81        return SparseVector(l.shape[0], csc.indices, csc.data)
82    else:
83        raise TypeError("Cannot convert type %s into Vector" % type(l))
84
85
86def _vector_size(v):
87    """
88    Returns the size of the vector.
89
90    >>> _vector_size([1., 2., 3.])
91    3
92    >>> _vector_size((1., 2., 3.))
93    3
94    >>> _vector_size(array.array('d', [1., 2., 3.]))
95    3
96    >>> _vector_size(np.zeros(3))
97    3
98    >>> _vector_size(np.zeros((3, 1)))
99    3
100    >>> _vector_size(np.zeros((1, 3)))
101    Traceback (most recent call last):
102        ...
103    ValueError: Cannot treat an ndarray of shape (1, 3) as a vector
104    """
105    if isinstance(v, Vector):
106        return len(v)
107    elif type(v) in (array.array, list, tuple, xrange):
108        return len(v)
109    elif type(v) == np.ndarray:
110        if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):
111            return len(v)
112        else:
113            raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
114    elif _have_scipy and scipy.sparse.issparse(v):
115        assert v.shape[1] == 1, "Expected column vector"
116        return v.shape[0]
117    else:
118        raise TypeError("Cannot treat type %s as a vector" % type(v))
119
120
121def _format_float(f, digits=4):
122    s = str(round(f, digits))
123    if '.' in s:
124        s = s[:s.index('.') + 1 + digits]
125    return s
126
127
128def _format_float_list(l):
129    return [_format_float(x) for x in l]
130
131
132def _double_to_long_bits(value):
133    if np.isnan(value):
134        value = float('nan')
135    # pack double into 64 bits, then unpack as long int
136    return struct.unpack('Q', struct.pack('d', value))[0]
137
138
139class VectorUDT(UserDefinedType):
140    """
141    SQL user-defined type (UDT) for Vector.
142    """
143
144    @classmethod
145    def sqlType(cls):
146        return StructType([
147            StructField("type", ByteType(), False),
148            StructField("size", IntegerType(), True),
149            StructField("indices", ArrayType(IntegerType(), False), True),
150            StructField("values", ArrayType(DoubleType(), False), True)])
151
152    @classmethod
153    def module(cls):
154        return "pyspark.mllib.linalg"
155
156    @classmethod
157    def scalaUDT(cls):
158        return "org.apache.spark.mllib.linalg.VectorUDT"
159
160    def serialize(self, obj):
161        if isinstance(obj, SparseVector):
162            indices = [int(i) for i in obj.indices]
163            values = [float(v) for v in obj.values]
164            return (0, obj.size, indices, values)
165        elif isinstance(obj, DenseVector):
166            values = [float(v) for v in obj]
167            return (1, None, None, values)
168        else:
169            raise TypeError("cannot serialize %r of type %r" % (obj, type(obj)))
170
171    def deserialize(self, datum):
172        assert len(datum) == 4, \
173            "VectorUDT.deserialize given row with length %d but requires 4" % len(datum)
174        tpe = datum[0]
175        if tpe == 0:
176            return SparseVector(datum[1], datum[2], datum[3])
177        elif tpe == 1:
178            return DenseVector(datum[3])
179        else:
180            raise ValueError("do not recognize type %r" % tpe)
181
182    def simpleString(self):
183        return "vector"
184
185
186class MatrixUDT(UserDefinedType):
187    """
188    SQL user-defined type (UDT) for Matrix.
189    """
190
191    @classmethod
192    def sqlType(cls):
193        return StructType([
194            StructField("type", ByteType(), False),
195            StructField("numRows", IntegerType(), False),
196            StructField("numCols", IntegerType(), False),
197            StructField("colPtrs", ArrayType(IntegerType(), False), True),
198            StructField("rowIndices", ArrayType(IntegerType(), False), True),
199            StructField("values", ArrayType(DoubleType(), False), True),
200            StructField("isTransposed", BooleanType(), False)])
201
202    @classmethod
203    def module(cls):
204        return "pyspark.mllib.linalg"
205
206    @classmethod
207    def scalaUDT(cls):
208        return "org.apache.spark.mllib.linalg.MatrixUDT"
209
210    def serialize(self, obj):
211        if isinstance(obj, SparseMatrix):
212            colPtrs = [int(i) for i in obj.colPtrs]
213            rowIndices = [int(i) for i in obj.rowIndices]
214            values = [float(v) for v in obj.values]
215            return (0, obj.numRows, obj.numCols, colPtrs,
216                    rowIndices, values, bool(obj.isTransposed))
217        elif isinstance(obj, DenseMatrix):
218            values = [float(v) for v in obj.values]
219            return (1, obj.numRows, obj.numCols, None, None, values,
220                    bool(obj.isTransposed))
221        else:
222            raise TypeError("cannot serialize type %r" % (type(obj)))
223
224    def deserialize(self, datum):
225        assert len(datum) == 7, \
226            "MatrixUDT.deserialize given row with length %d but requires 7" % len(datum)
227        tpe = datum[0]
228        if tpe == 0:
229            return SparseMatrix(*datum[1:])
230        elif tpe == 1:
231            return DenseMatrix(datum[1], datum[2], datum[5], datum[6])
232        else:
233            raise ValueError("do not recognize type %r" % tpe)
234
235    def simpleString(self):
236        return "matrix"
237
238
239class Vector(object):
240
241    __UDT__ = VectorUDT()
242
243    """
244    Abstract class for DenseVector and SparseVector
245    """
246    def toArray(self):
247        """
248        Convert the vector into an numpy.ndarray
249
250        :return: numpy.ndarray
251        """
252        raise NotImplementedError
253
254    def asML(self):
255        """
256        Convert this vector to the new mllib-local representation.
257        This does NOT copy the data; it copies references.
258
259        :return: :py:class:`pyspark.ml.linalg.Vector`
260        """
261        raise NotImplementedError
262
263
264class DenseVector(Vector):
265    """
266    A dense vector represented by a value array. We use numpy array for
267    storage and arithmetics will be delegated to the underlying numpy
268    array.
269
270    >>> v = Vectors.dense([1.0, 2.0])
271    >>> u = Vectors.dense([3.0, 4.0])
272    >>> v + u
273    DenseVector([4.0, 6.0])
274    >>> 2 - v
275    DenseVector([1.0, 0.0])
276    >>> v / 2
277    DenseVector([0.5, 1.0])
278    >>> v * u
279    DenseVector([3.0, 8.0])
280    >>> u / v
281    DenseVector([3.0, 2.0])
282    >>> u % 2
283    DenseVector([1.0, 0.0])
284    """
285    def __init__(self, ar):
286        if isinstance(ar, bytes):
287            ar = np.frombuffer(ar, dtype=np.float64)
288        elif not isinstance(ar, np.ndarray):
289            ar = np.array(ar, dtype=np.float64)
290        if ar.dtype != np.float64:
291            ar = ar.astype(np.float64)
292        self.array = ar
293
294    @staticmethod
295    def parse(s):
296        """
297        Parse string representation back into the DenseVector.
298
299        >>> DenseVector.parse(' [ 0.0,1.0,2.0,  3.0]')
300        DenseVector([0.0, 1.0, 2.0, 3.0])
301        """
302        start = s.find('[')
303        if start == -1:
304            raise ValueError("Array should start with '['.")
305        end = s.find(']')
306        if end == -1:
307            raise ValueError("Array should end with ']'.")
308        s = s[start + 1: end]
309
310        try:
311            values = [float(val) for val in s.split(',') if val]
312        except ValueError:
313            raise ValueError("Unable to parse values from %s" % s)
314        return DenseVector(values)
315
316    def __reduce__(self):
317        return DenseVector, (self.array.tostring(),)
318
319    def numNonzeros(self):
320        """
321        Number of nonzero elements. This scans all active values and count non zeros
322        """
323        return np.count_nonzero(self.array)
324
325    def norm(self, p):
326        """
327        Calculates the norm of a DenseVector.
328
329        >>> a = DenseVector([0, -1, 2, -3])
330        >>> a.norm(2)
331        3.7...
332        >>> a.norm(1)
333        6.0
334        """
335        return np.linalg.norm(self.array, p)
336
337    def dot(self, other):
338        """
339        Compute the dot product of two Vectors. We support
340        (Numpy array, list, SparseVector, or SciPy sparse)
341        and a target NumPy array that is either 1- or 2-dimensional.
342        Equivalent to calling numpy.dot of the two vectors.
343
344        >>> dense = DenseVector(array.array('d', [1., 2.]))
345        >>> dense.dot(dense)
346        5.0
347        >>> dense.dot(SparseVector(2, [0, 1], [2., 1.]))
348        4.0
349        >>> dense.dot(range(1, 3))
350        5.0
351        >>> dense.dot(np.array(range(1, 3)))
352        5.0
353        >>> dense.dot([1.,])
354        Traceback (most recent call last):
355            ...
356        AssertionError: dimension mismatch
357        >>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F'))
358        array([  5.,  11.])
359        >>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F'))
360        Traceback (most recent call last):
361            ...
362        AssertionError: dimension mismatch
363        """
364        if type(other) == np.ndarray:
365            if other.ndim > 1:
366                assert len(self) == other.shape[0], "dimension mismatch"
367            return np.dot(self.array, other)
368        elif _have_scipy and scipy.sparse.issparse(other):
369            assert len(self) == other.shape[0], "dimension mismatch"
370            return other.transpose().dot(self.toArray())
371        else:
372            assert len(self) == _vector_size(other), "dimension mismatch"
373            if isinstance(other, SparseVector):
374                return other.dot(self)
375            elif isinstance(other, Vector):
376                return np.dot(self.toArray(), other.toArray())
377            else:
378                return np.dot(self.toArray(), other)
379
380    def squared_distance(self, other):
381        """
382        Squared distance of two Vectors.
383
384        >>> dense1 = DenseVector(array.array('d', [1., 2.]))
385        >>> dense1.squared_distance(dense1)
386        0.0
387        >>> dense2 = np.array([2., 1.])
388        >>> dense1.squared_distance(dense2)
389        2.0
390        >>> dense3 = [2., 1.]
391        >>> dense1.squared_distance(dense3)
392        2.0
393        >>> sparse1 = SparseVector(2, [0, 1], [2., 1.])
394        >>> dense1.squared_distance(sparse1)
395        2.0
396        >>> dense1.squared_distance([1.,])
397        Traceback (most recent call last):
398            ...
399        AssertionError: dimension mismatch
400        >>> dense1.squared_distance(SparseVector(1, [0,], [1.,]))
401        Traceback (most recent call last):
402            ...
403        AssertionError: dimension mismatch
404        """
405        assert len(self) == _vector_size(other), "dimension mismatch"
406        if isinstance(other, SparseVector):
407            return other.squared_distance(self)
408        elif _have_scipy and scipy.sparse.issparse(other):
409            return _convert_to_vector(other).squared_distance(self)
410
411        if isinstance(other, Vector):
412            other = other.toArray()
413        elif not isinstance(other, np.ndarray):
414            other = np.array(other)
415        diff = self.toArray() - other
416        return np.dot(diff, diff)
417
418    def toArray(self):
419        """
420        Returns an numpy.ndarray
421        """
422        return self.array
423
424    def asML(self):
425        """
426        Convert this vector to the new mllib-local representation.
427        This does NOT copy the data; it copies references.
428
429        :return: :py:class:`pyspark.ml.linalg.DenseVector`
430
431        .. versionadded:: 2.0.0
432        """
433        return newlinalg.DenseVector(self.array)
434
435    @property
436    def values(self):
437        """
438        Returns a list of values
439        """
440        return self.array
441
442    def __getitem__(self, item):
443        return self.array[item]
444
445    def __len__(self):
446        return len(self.array)
447
448    def __str__(self):
449        return "[" + ",".join([str(v) for v in self.array]) + "]"
450
451    def __repr__(self):
452        return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array))
453
454    def __eq__(self, other):
455        if isinstance(other, DenseVector):
456            return np.array_equal(self.array, other.array)
457        elif isinstance(other, SparseVector):
458            if len(self) != other.size:
459                return False
460            return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values)
461        return False
462
463    def __ne__(self, other):
464        return not self == other
465
466    def __hash__(self):
467        size = len(self)
468        result = 31 + size
469        nnz = 0
470        i = 0
471        while i < size and nnz < 128:
472            if self.array[i] != 0:
473                result = 31 * result + i
474                bits = _double_to_long_bits(self.array[i])
475                result = 31 * result + (bits ^ (bits >> 32))
476                nnz += 1
477            i += 1
478        return result
479
480    def __getattr__(self, item):
481        return getattr(self.array, item)
482
483    def _delegate(op):
484        def func(self, other):
485            if isinstance(other, DenseVector):
486                other = other.array
487            return DenseVector(getattr(self.array, op)(other))
488        return func
489
490    __neg__ = _delegate("__neg__")
491    __add__ = _delegate("__add__")
492    __sub__ = _delegate("__sub__")
493    __mul__ = _delegate("__mul__")
494    __div__ = _delegate("__div__")
495    __truediv__ = _delegate("__truediv__")
496    __mod__ = _delegate("__mod__")
497    __radd__ = _delegate("__radd__")
498    __rsub__ = _delegate("__rsub__")
499    __rmul__ = _delegate("__rmul__")
500    __rdiv__ = _delegate("__rdiv__")
501    __rtruediv__ = _delegate("__rtruediv__")
502    __rmod__ = _delegate("__rmod__")
503
504
505class SparseVector(Vector):
506    """
507    A simple sparse vector class for passing data to MLlib. Users may
508    alternatively pass SciPy's {scipy.sparse} data types.
509    """
510    def __init__(self, size, *args):
511        """
512        Create a sparse vector, using either a dictionary, a list of
513        (index, value) pairs, or two separate arrays of indices and
514        values (sorted by index).
515
516        :param size: Size of the vector.
517        :param args: Active entries, as a dictionary {index: value, ...},
518          a list of tuples [(index, value), ...], or a list of strictly
519          increasing indices and a list of corresponding values [index, ...],
520          [value, ...]. Inactive entries are treated as zeros.
521
522        >>> SparseVector(4, {1: 1.0, 3: 5.5})
523        SparseVector(4, {1: 1.0, 3: 5.5})
524        >>> SparseVector(4, [(1, 1.0), (3, 5.5)])
525        SparseVector(4, {1: 1.0, 3: 5.5})
526        >>> SparseVector(4, [1, 3], [1.0, 5.5])
527        SparseVector(4, {1: 1.0, 3: 5.5})
528        """
529        self.size = int(size)
530        """ Size of the vector. """
531        assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments"
532        if len(args) == 1:
533            pairs = args[0]
534            if type(pairs) == dict:
535                pairs = pairs.items()
536            pairs = sorted(pairs)
537            self.indices = np.array([p[0] for p in pairs], dtype=np.int32)
538            """ A list of indices corresponding to active entries. """
539            self.values = np.array([p[1] for p in pairs], dtype=np.float64)
540            """ A list of values corresponding to active entries. """
541        else:
542            if isinstance(args[0], bytes):
543                assert isinstance(args[1], bytes), "values should be string too"
544                if args[0]:
545                    self.indices = np.frombuffer(args[0], np.int32)
546                    self.values = np.frombuffer(args[1], np.float64)
547                else:
548                    # np.frombuffer() doesn't work well with empty string in older version
549                    self.indices = np.array([], dtype=np.int32)
550                    self.values = np.array([], dtype=np.float64)
551            else:
552                self.indices = np.array(args[0], dtype=np.int32)
553                self.values = np.array(args[1], dtype=np.float64)
554            assert len(self.indices) == len(self.values), "index and value arrays not same length"
555            for i in xrange(len(self.indices) - 1):
556                if self.indices[i] >= self.indices[i + 1]:
557                    raise TypeError(
558                        "Indices %s and %s are not strictly increasing"
559                        % (self.indices[i], self.indices[i + 1]))
560
561    def numNonzeros(self):
562        """
563        Number of nonzero elements. This scans all active values and count non zeros.
564        """
565        return np.count_nonzero(self.values)
566
567    def norm(self, p):
568        """
569        Calculates the norm of a SparseVector.
570
571        >>> a = SparseVector(4, [0, 1], [3., -4.])
572        >>> a.norm(1)
573        7.0
574        >>> a.norm(2)
575        5.0
576        """
577        return np.linalg.norm(self.values, p)
578
579    def __reduce__(self):
580        return (
581            SparseVector,
582            (self.size, self.indices.tostring(), self.values.tostring()))
583
584    @staticmethod
585    def parse(s):
586        """
587        Parse string representation back into the SparseVector.
588
589        >>> SparseVector.parse(' (4, [0,1 ],[ 4.0,5.0] )')
590        SparseVector(4, {0: 4.0, 1: 5.0})
591        """
592        start = s.find('(')
593        if start == -1:
594            raise ValueError("Tuple should start with '('")
595        end = s.find(')')
596        if end == -1:
597            raise ValueError("Tuple should end with ')'")
598        s = s[start + 1: end].strip()
599
600        size = s[: s.find(',')]
601        try:
602            size = int(size)
603        except ValueError:
604            raise ValueError("Cannot parse size %s." % size)
605
606        ind_start = s.find('[')
607        if ind_start == -1:
608            raise ValueError("Indices array should start with '['.")
609        ind_end = s.find(']')
610        if ind_end == -1:
611            raise ValueError("Indices array should end with ']'")
612        new_s = s[ind_start + 1: ind_end]
613        ind_list = new_s.split(',')
614        try:
615            indices = [int(ind) for ind in ind_list if ind]
616        except ValueError:
617            raise ValueError("Unable to parse indices from %s." % new_s)
618        s = s[ind_end + 1:].strip()
619
620        val_start = s.find('[')
621        if val_start == -1:
622            raise ValueError("Values array should start with '['.")
623        val_end = s.find(']')
624        if val_end == -1:
625            raise ValueError("Values array should end with ']'.")
626        val_list = s[val_start + 1: val_end].split(',')
627        try:
628            values = [float(val) for val in val_list if val]
629        except ValueError:
630            raise ValueError("Unable to parse values from %s." % s)
631        return SparseVector(size, indices, values)
632
633    def dot(self, other):
634        """
635        Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
636
637        >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
638        >>> a.dot(a)
639        25.0
640        >>> a.dot(array.array('d', [1., 2., 3., 4.]))
641        22.0
642        >>> b = SparseVector(4, [2], [1.0])
643        >>> a.dot(b)
644        0.0
645        >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))
646        array([ 22.,  22.])
647        >>> a.dot([1., 2., 3.])
648        Traceback (most recent call last):
649            ...
650        AssertionError: dimension mismatch
651        >>> a.dot(np.array([1., 2.]))
652        Traceback (most recent call last):
653            ...
654        AssertionError: dimension mismatch
655        >>> a.dot(DenseVector([1., 2.]))
656        Traceback (most recent call last):
657            ...
658        AssertionError: dimension mismatch
659        >>> a.dot(np.zeros((3, 2)))
660        Traceback (most recent call last):
661            ...
662        AssertionError: dimension mismatch
663        """
664
665        if isinstance(other, np.ndarray):
666            if other.ndim not in [2, 1]:
667                raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim)
668            assert len(self) == other.shape[0], "dimension mismatch"
669            return np.dot(self.values, other[self.indices])
670
671        assert len(self) == _vector_size(other), "dimension mismatch"
672
673        if isinstance(other, DenseVector):
674            return np.dot(other.array[self.indices], self.values)
675
676        elif isinstance(other, SparseVector):
677            # Find out common indices.
678            self_cmind = np.in1d(self.indices, other.indices, assume_unique=True)
679            self_values = self.values[self_cmind]
680            if self_values.size == 0:
681                return 0.0
682            else:
683                other_cmind = np.in1d(other.indices, self.indices, assume_unique=True)
684                return np.dot(self_values, other.values[other_cmind])
685
686        else:
687            return self.dot(_convert_to_vector(other))
688
689    def squared_distance(self, other):
690        """
691        Squared distance from a SparseVector or 1-dimensional NumPy array.
692
693        >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
694        >>> a.squared_distance(a)
695        0.0
696        >>> a.squared_distance(array.array('d', [1., 2., 3., 4.]))
697        11.0
698        >>> a.squared_distance(np.array([1., 2., 3., 4.]))
699        11.0
700        >>> b = SparseVector(4, [2], [1.0])
701        >>> a.squared_distance(b)
702        26.0
703        >>> b.squared_distance(a)
704        26.0
705        >>> b.squared_distance([1., 2.])
706        Traceback (most recent call last):
707            ...
708        AssertionError: dimension mismatch
709        >>> b.squared_distance(SparseVector(3, [1,], [1.0,]))
710        Traceback (most recent call last):
711            ...
712        AssertionError: dimension mismatch
713        """
714        assert len(self) == _vector_size(other), "dimension mismatch"
715
716        if isinstance(other, np.ndarray) or isinstance(other, DenseVector):
717            if isinstance(other, np.ndarray) and other.ndim != 1:
718                raise Exception("Cannot call squared_distance with %d-dimensional array" %
719                                other.ndim)
720            if isinstance(other, DenseVector):
721                other = other.array
722            sparse_ind = np.zeros(other.size, dtype=bool)
723            sparse_ind[self.indices] = True
724            dist = other[sparse_ind] - self.values
725            result = np.dot(dist, dist)
726
727            other_ind = other[~sparse_ind]
728            result += np.dot(other_ind, other_ind)
729            return result
730
731        elif isinstance(other, SparseVector):
732            result = 0.0
733            i, j = 0, 0
734            while i < len(self.indices) and j < len(other.indices):
735                if self.indices[i] == other.indices[j]:
736                    diff = self.values[i] - other.values[j]
737                    result += diff * diff
738                    i += 1
739                    j += 1
740                elif self.indices[i] < other.indices[j]:
741                    result += self.values[i] * self.values[i]
742                    i += 1
743                else:
744                    result += other.values[j] * other.values[j]
745                    j += 1
746            while i < len(self.indices):
747                result += self.values[i] * self.values[i]
748                i += 1
749            while j < len(other.indices):
750                result += other.values[j] * other.values[j]
751                j += 1
752            return result
753        else:
754            return self.squared_distance(_convert_to_vector(other))
755
756    def toArray(self):
757        """
758        Returns a copy of this SparseVector as a 1-dimensional NumPy array.
759        """
760        arr = np.zeros((self.size,), dtype=np.float64)
761        arr[self.indices] = self.values
762        return arr
763
764    def asML(self):
765        """
766        Convert this vector to the new mllib-local representation.
767        This does NOT copy the data; it copies references.
768
769        :return: :py:class:`pyspark.ml.linalg.SparseVector`
770
771        .. versionadded:: 2.0.0
772        """
773        return newlinalg.SparseVector(self.size, self.indices, self.values)
774
775    def __len__(self):
776        return self.size
777
778    def __str__(self):
779        inds = "[" + ",".join([str(i) for i in self.indices]) + "]"
780        vals = "[" + ",".join([str(v) for v in self.values]) + "]"
781        return "(" + ",".join((str(self.size), inds, vals)) + ")"
782
783    def __repr__(self):
784        inds = self.indices
785        vals = self.values
786        entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i]))
787                             for i in xrange(len(inds))])
788        return "SparseVector({0}, {{{1}}})".format(self.size, entries)
789
790    def __eq__(self, other):
791        if isinstance(other, SparseVector):
792            return other.size == self.size and np.array_equal(other.indices, self.indices) \
793                and np.array_equal(other.values, self.values)
794        elif isinstance(other, DenseVector):
795            if self.size != len(other):
796                return False
797            return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array)
798        return False
799
800    def __getitem__(self, index):
801        inds = self.indices
802        vals = self.values
803        if not isinstance(index, int):
804            raise TypeError(
805                "Indices must be of type integer, got type %s" % type(index))
806
807        if index >= self.size or index < -self.size:
808            raise IndexError("Index %d out of bounds." % index)
809        if index < 0:
810            index += self.size
811
812        if (inds.size == 0) or (index > inds.item(-1)):
813            return 0.
814
815        insert_index = np.searchsorted(inds, index)
816        row_ind = inds[insert_index]
817        if row_ind == index:
818            return vals[insert_index]
819        return 0.
820
821    def __ne__(self, other):
822        return not self.__eq__(other)
823
824    def __hash__(self):
825        result = 31 + self.size
826        nnz = 0
827        i = 0
828        while i < len(self.values) and nnz < 128:
829            if self.values[i] != 0:
830                result = 31 * result + int(self.indices[i])
831                bits = _double_to_long_bits(self.values[i])
832                result = 31 * result + (bits ^ (bits >> 32))
833                nnz += 1
834            i += 1
835        return result
836
837
838class Vectors(object):
839
840    """
841    Factory methods for working with vectors.
842
843    .. note:: Dense vectors are simply represented as NumPy array objects,
844        so there is no need to covert them for use in MLlib. For sparse vectors,
845        the factory methods in this class create an MLlib-compatible type, or users
846        can pass in SciPy's C{scipy.sparse} column vectors.
847    """
848
849    @staticmethod
850    def sparse(size, *args):
851        """
852        Create a sparse vector, using either a dictionary, a list of
853        (index, value) pairs, or two separate arrays of indices and
854        values (sorted by index).
855
856        :param size: Size of the vector.
857        :param args: Non-zero entries, as a dictionary, list of tuples,
858                     or two sorted lists containing indices and values.
859
860        >>> Vectors.sparse(4, {1: 1.0, 3: 5.5})
861        SparseVector(4, {1: 1.0, 3: 5.5})
862        >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)])
863        SparseVector(4, {1: 1.0, 3: 5.5})
864        >>> Vectors.sparse(4, [1, 3], [1.0, 5.5])
865        SparseVector(4, {1: 1.0, 3: 5.5})
866        """
867        return SparseVector(size, *args)
868
869    @staticmethod
870    def dense(*elements):
871        """
872        Create a dense vector of 64-bit floats from a Python list or numbers.
873
874        >>> Vectors.dense([1, 2, 3])
875        DenseVector([1.0, 2.0, 3.0])
876        >>> Vectors.dense(1.0, 2.0)
877        DenseVector([1.0, 2.0])
878        """
879        if len(elements) == 1 and not isinstance(elements[0], (float, int, long)):
880            # it's list, numpy.array or other iterable object.
881            elements = elements[0]
882        return DenseVector(elements)
883
884    @staticmethod
885    def fromML(vec):
886        """
887        Convert a vector from the new mllib-local representation.
888        This does NOT copy the data; it copies references.
889
890        :param vec: a :py:class:`pyspark.ml.linalg.Vector`
891        :return: a :py:class:`pyspark.mllib.linalg.Vector`
892
893        .. versionadded:: 2.0.0
894        """
895        if isinstance(vec, newlinalg.DenseVector):
896            return DenseVector(vec.array)
897        elif isinstance(vec, newlinalg.SparseVector):
898            return SparseVector(vec.size, vec.indices, vec.values)
899        else:
900            raise TypeError("Unsupported vector type %s" % type(vec))
901
902    @staticmethod
903    def stringify(vector):
904        """
905        Converts a vector into a string, which can be recognized by
906        Vectors.parse().
907
908        >>> Vectors.stringify(Vectors.sparse(2, [1], [1.0]))
909        '(2,[1],[1.0])'
910        >>> Vectors.stringify(Vectors.dense([0.0, 1.0]))
911        '[0.0,1.0]'
912        """
913        return str(vector)
914
915    @staticmethod
916    def squared_distance(v1, v2):
917        """
918        Squared distance between two vectors.
919        a and b can be of type SparseVector, DenseVector, np.ndarray
920        or array.array.
921
922        >>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
923        >>> b = Vectors.dense([2, 5, 4, 1])
924        >>> a.squared_distance(b)
925        51.0
926        """
927        v1, v2 = _convert_to_vector(v1), _convert_to_vector(v2)
928        return v1.squared_distance(v2)
929
930    @staticmethod
931    def norm(vector, p):
932        """
933        Find norm of the given vector.
934        """
935        return _convert_to_vector(vector).norm(p)
936
937    @staticmethod
938    def parse(s):
939        """Parse a string representation back into the Vector.
940
941        >>> Vectors.parse('[2,1,2 ]')
942        DenseVector([2.0, 1.0, 2.0])
943        >>> Vectors.parse(' ( 100,  [0],  [2])')
944        SparseVector(100, {0: 2.0})
945        """
946        if s.find('(') == -1 and s.find('[') != -1:
947            return DenseVector.parse(s)
948        elif s.find('(') != -1:
949            return SparseVector.parse(s)
950        else:
951            raise ValueError(
952                "Cannot find tokens '[' or '(' from the input string.")
953
954    @staticmethod
955    def zeros(size):
956        return DenseVector(np.zeros(size))
957
958    @staticmethod
959    def _equals(v1_indices, v1_values, v2_indices, v2_values):
960        """
961        Check equality between sparse/dense vectors,
962        v1_indices and v2_indices assume to be strictly increasing.
963        """
964        v1_size = len(v1_values)
965        v2_size = len(v2_values)
966        k1 = 0
967        k2 = 0
968        all_equal = True
969        while all_equal:
970            while k1 < v1_size and v1_values[k1] == 0:
971                k1 += 1
972            while k2 < v2_size and v2_values[k2] == 0:
973                k2 += 1
974
975            if k1 >= v1_size or k2 >= v2_size:
976                return k1 >= v1_size and k2 >= v2_size
977
978            all_equal = v1_indices[k1] == v2_indices[k2] and v1_values[k1] == v2_values[k2]
979            k1 += 1
980            k2 += 1
981        return all_equal
982
983
984class Matrix(object):
985
986    __UDT__ = MatrixUDT()
987
988    """
989    Represents a local matrix.
990    """
991    def __init__(self, numRows, numCols, isTransposed=False):
992        self.numRows = numRows
993        self.numCols = numCols
994        self.isTransposed = isTransposed
995
996    def toArray(self):
997        """
998        Returns its elements in a NumPy ndarray.
999        """
1000        raise NotImplementedError
1001
1002    def asML(self):
1003        """
1004        Convert this matrix to the new mllib-local representation.
1005        This does NOT copy the data; it copies references.
1006        """
1007        raise NotImplementedError
1008
1009    @staticmethod
1010    def _convert_to_array(array_like, dtype):
1011        """
1012        Convert Matrix attributes which are array-like or buffer to array.
1013        """
1014        if isinstance(array_like, bytes):
1015            return np.frombuffer(array_like, dtype=dtype)
1016        return np.asarray(array_like, dtype=dtype)
1017
1018
1019class DenseMatrix(Matrix):
1020    """
1021    Column-major dense matrix.
1022    """
1023    def __init__(self, numRows, numCols, values, isTransposed=False):
1024        Matrix.__init__(self, numRows, numCols, isTransposed)
1025        values = self._convert_to_array(values, np.float64)
1026        assert len(values) == numRows * numCols
1027        self.values = values
1028
1029    def __reduce__(self):
1030        return DenseMatrix, (
1031            self.numRows, self.numCols, self.values.tostring(),
1032            int(self.isTransposed))
1033
1034    def __str__(self):
1035        """
1036        Pretty printing of a DenseMatrix
1037
1038        >>> dm = DenseMatrix(2, 2, range(4))
1039        >>> print(dm)
1040        DenseMatrix([[ 0.,  2.],
1041                     [ 1.,  3.]])
1042        >>> dm = DenseMatrix(2, 2, range(4), isTransposed=True)
1043        >>> print(dm)
1044        DenseMatrix([[ 0.,  1.],
1045                     [ 2.,  3.]])
1046        """
1047        # Inspired by __repr__ in scipy matrices.
1048        array_lines = repr(self.toArray()).splitlines()
1049
1050        # We need to adjust six spaces which is the difference in number
1051        # of letters between "DenseMatrix" and "array"
1052        x = '\n'.join([(" " * 6 + line) for line in array_lines[1:]])
1053        return array_lines[0].replace("array", "DenseMatrix") + "\n" + x
1054
1055    def __repr__(self):
1056        """
1057        Representation of a DenseMatrix
1058
1059        >>> dm = DenseMatrix(2, 2, range(4))
1060        >>> dm
1061        DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False)
1062        """
1063        # If the number of values are less than seventeen then return as it is.
1064        # Else return first eight values and last eight values.
1065        if len(self.values) < 17:
1066            entries = _format_float_list(self.values)
1067        else:
1068            entries = (
1069                _format_float_list(self.values[:8]) +
1070                ["..."] +
1071                _format_float_list(self.values[-8:])
1072            )
1073
1074        entries = ", ".join(entries)
1075        return "DenseMatrix({0}, {1}, [{2}], {3})".format(
1076            self.numRows, self.numCols, entries, self.isTransposed)
1077
1078    def toArray(self):
1079        """
1080        Return an numpy.ndarray
1081
1082        >>> m = DenseMatrix(2, 2, range(4))
1083        >>> m.toArray()
1084        array([[ 0.,  2.],
1085               [ 1.,  3.]])
1086        """
1087        if self.isTransposed:
1088            return np.asfortranarray(
1089                self.values.reshape((self.numRows, self.numCols)))
1090        else:
1091            return self.values.reshape((self.numRows, self.numCols), order='F')
1092
1093    def toSparse(self):
1094        """Convert to SparseMatrix"""
1095        if self.isTransposed:
1096            values = np.ravel(self.toArray(), order='F')
1097        else:
1098            values = self.values
1099        indices = np.nonzero(values)[0]
1100        colCounts = np.bincount(indices // self.numRows)
1101        colPtrs = np.cumsum(np.hstack(
1102            (0, colCounts, np.zeros(self.numCols - colCounts.size))))
1103        values = values[indices]
1104        rowIndices = indices % self.numRows
1105
1106        return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values)
1107
1108    def asML(self):
1109        """
1110        Convert this matrix to the new mllib-local representation.
1111        This does NOT copy the data; it copies references.
1112
1113        :return: :py:class:`pyspark.ml.linalg.DenseMatrix`
1114
1115        .. versionadded:: 2.0.0
1116        """
1117        return newlinalg.DenseMatrix(self.numRows, self.numCols, self.values, self.isTransposed)
1118
1119    def __getitem__(self, indices):
1120        i, j = indices
1121        if i < 0 or i >= self.numRows:
1122            raise IndexError("Row index %d is out of range [0, %d)"
1123                             % (i, self.numRows))
1124        if j >= self.numCols or j < 0:
1125            raise IndexError("Column index %d is out of range [0, %d)"
1126                             % (j, self.numCols))
1127
1128        if self.isTransposed:
1129            return self.values[i * self.numCols + j]
1130        else:
1131            return self.values[i + j * self.numRows]
1132
1133    def __eq__(self, other):
1134        if (not isinstance(other, DenseMatrix) or
1135                self.numRows != other.numRows or
1136                self.numCols != other.numCols):
1137            return False
1138
1139        self_values = np.ravel(self.toArray(), order='F')
1140        other_values = np.ravel(other.toArray(), order='F')
1141        return all(self_values == other_values)
1142
1143
1144class SparseMatrix(Matrix):
1145    """Sparse Matrix stored in CSC format."""
1146    def __init__(self, numRows, numCols, colPtrs, rowIndices, values,
1147                 isTransposed=False):
1148        Matrix.__init__(self, numRows, numCols, isTransposed)
1149        self.colPtrs = self._convert_to_array(colPtrs, np.int32)
1150        self.rowIndices = self._convert_to_array(rowIndices, np.int32)
1151        self.values = self._convert_to_array(values, np.float64)
1152
1153        if self.isTransposed:
1154            if self.colPtrs.size != numRows + 1:
1155                raise ValueError("Expected colPtrs of size %d, got %d."
1156                                 % (numRows + 1, self.colPtrs.size))
1157        else:
1158            if self.colPtrs.size != numCols + 1:
1159                raise ValueError("Expected colPtrs of size %d, got %d."
1160                                 % (numCols + 1, self.colPtrs.size))
1161        if self.rowIndices.size != self.values.size:
1162            raise ValueError("Expected rowIndices of length %d, got %d."
1163                             % (self.rowIndices.size, self.values.size))
1164
1165    def __str__(self):
1166        """
1167        Pretty printing of a SparseMatrix
1168
1169        >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
1170        >>> print(sm1)
1171        2 X 2 CSCMatrix
1172        (0,0) 2.0
1173        (1,0) 3.0
1174        (1,1) 4.0
1175        >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
1176        >>> print(sm1)
1177        2 X 2 CSRMatrix
1178        (0,0) 2.0
1179        (0,1) 3.0
1180        (1,1) 4.0
1181        """
1182        spstr = "{0} X {1} ".format(self.numRows, self.numCols)
1183        if self.isTransposed:
1184            spstr += "CSRMatrix\n"
1185        else:
1186            spstr += "CSCMatrix\n"
1187
1188        cur_col = 0
1189        smlist = []
1190
1191        # Display first 16 values.
1192        if len(self.values) <= 16:
1193            zipindval = zip(self.rowIndices, self.values)
1194        else:
1195            zipindval = zip(self.rowIndices[:16], self.values[:16])
1196        for i, (rowInd, value) in enumerate(zipindval):
1197            if self.colPtrs[cur_col + 1] <= i:
1198                cur_col += 1
1199            if self.isTransposed:
1200                smlist.append('({0},{1}) {2}'.format(
1201                    cur_col, rowInd, _format_float(value)))
1202            else:
1203                smlist.append('({0},{1}) {2}'.format(
1204                    rowInd, cur_col, _format_float(value)))
1205        spstr += "\n".join(smlist)
1206
1207        if len(self.values) > 16:
1208            spstr += "\n.." * 2
1209        return spstr
1210
1211    def __repr__(self):
1212        """
1213        Representation of a SparseMatrix
1214
1215        >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
1216        >>> sm1
1217        SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0], False)
1218        """
1219        rowIndices = list(self.rowIndices)
1220        colPtrs = list(self.colPtrs)
1221
1222        if len(self.values) <= 16:
1223            values = _format_float_list(self.values)
1224
1225        else:
1226            values = (
1227                _format_float_list(self.values[:8]) +
1228                ["..."] +
1229                _format_float_list(self.values[-8:])
1230            )
1231            rowIndices = rowIndices[:8] + ["..."] + rowIndices[-8:]
1232
1233        if len(self.colPtrs) > 16:
1234            colPtrs = colPtrs[:8] + ["..."] + colPtrs[-8:]
1235
1236        values = ", ".join(values)
1237        rowIndices = ", ".join([str(ind) for ind in rowIndices])
1238        colPtrs = ", ".join([str(ptr) for ptr in colPtrs])
1239        return "SparseMatrix({0}, {1}, [{2}], [{3}], [{4}], {5})".format(
1240            self.numRows, self.numCols, colPtrs, rowIndices,
1241            values, self.isTransposed)
1242
1243    def __reduce__(self):
1244        return SparseMatrix, (
1245            self.numRows, self.numCols, self.colPtrs.tostring(),
1246            self.rowIndices.tostring(), self.values.tostring(),
1247            int(self.isTransposed))
1248
1249    def __getitem__(self, indices):
1250        i, j = indices
1251        if i < 0 or i >= self.numRows:
1252            raise IndexError("Row index %d is out of range [0, %d)"
1253                             % (i, self.numRows))
1254        if j < 0 or j >= self.numCols:
1255            raise IndexError("Column index %d is out of range [0, %d)"
1256                             % (j, self.numCols))
1257
1258        # If a CSR matrix is given, then the row index should be searched
1259        # for in ColPtrs, and the column index should be searched for in the
1260        # corresponding slice obtained from rowIndices.
1261        if self.isTransposed:
1262            j, i = i, j
1263
1264        colStart = self.colPtrs[j]
1265        colEnd = self.colPtrs[j + 1]
1266        nz = self.rowIndices[colStart: colEnd]
1267        ind = np.searchsorted(nz, i) + colStart
1268        if ind < colEnd and self.rowIndices[ind] == i:
1269            return self.values[ind]
1270        else:
1271            return 0.0
1272
1273    def toArray(self):
1274        """
1275        Return an numpy.ndarray
1276        """
1277        A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F')
1278        for k in xrange(self.colPtrs.size - 1):
1279            startptr = self.colPtrs[k]
1280            endptr = self.colPtrs[k + 1]
1281            if self.isTransposed:
1282                A[k, self.rowIndices[startptr:endptr]] = self.values[startptr:endptr]
1283            else:
1284                A[self.rowIndices[startptr:endptr], k] = self.values[startptr:endptr]
1285        return A
1286
1287    def toDense(self):
1288        densevals = np.ravel(self.toArray(), order='F')
1289        return DenseMatrix(self.numRows, self.numCols, densevals)
1290
1291    def asML(self):
1292        """
1293        Convert this matrix to the new mllib-local representation.
1294        This does NOT copy the data; it copies references.
1295
1296        :return: :py:class:`pyspark.ml.linalg.SparseMatrix`
1297
1298        .. versionadded:: 2.0.0
1299        """
1300        return newlinalg.SparseMatrix(self.numRows, self.numCols, self.colPtrs, self.rowIndices,
1301                                      self.values, self.isTransposed)
1302
1303    # TODO: More efficient implementation:
1304    def __eq__(self, other):
1305        return np.all(self.toArray() == other.toArray())
1306
1307
1308class Matrices(object):
1309    @staticmethod
1310    def dense(numRows, numCols, values):
1311        """
1312        Create a DenseMatrix
1313        """
1314        return DenseMatrix(numRows, numCols, values)
1315
1316    @staticmethod
1317    def sparse(numRows, numCols, colPtrs, rowIndices, values):
1318        """
1319        Create a SparseMatrix
1320        """
1321        return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values)
1322
1323    @staticmethod
1324    def fromML(mat):
1325        """
1326        Convert a matrix from the new mllib-local representation.
1327        This does NOT copy the data; it copies references.
1328
1329        :param mat: a :py:class:`pyspark.ml.linalg.Matrix`
1330        :return: a :py:class:`pyspark.mllib.linalg.Matrix`
1331
1332        .. versionadded:: 2.0.0
1333        """
1334        if isinstance(mat, newlinalg.DenseMatrix):
1335            return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed)
1336        elif isinstance(mat, newlinalg.SparseMatrix):
1337            return SparseMatrix(mat.numRows, mat.numCols, mat.colPtrs, mat.rowIndices,
1338                                mat.values, mat.isTransposed)
1339        else:
1340            raise TypeError("Unsupported matrix type %s" % type(mat))
1341
1342
1343class QRDecomposition(object):
1344    """
1345    Represents QR factors.
1346    """
1347    def __init__(self, Q, R):
1348        self._Q = Q
1349        self._R = R
1350
1351    @property
1352    @since('2.0.0')
1353    def Q(self):
1354        """
1355        An orthogonal matrix Q in a QR decomposition.
1356        May be null if not computed.
1357        """
1358        return self._Q
1359
1360    @property
1361    @since('2.0.0')
1362    def R(self):
1363        """
1364        An upper triangular matrix R in a QR decomposition.
1365        """
1366        return self._R
1367
1368
1369def _test():
1370    import doctest
1371    (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS)
1372    if failure_count:
1373        exit(-1)
1374
1375if __name__ == "__main__":
1376    _test()
1377