1#
2# Licensed to the Apache Software Foundation (ASF) under one or more
3# contributor license agreements.  See the NOTICE file distributed with
4# this work for additional information regarding copyright ownership.
5# The ASF licenses this file to You under the Apache License, Version 2.0
6# (the "License"); you may not use this file except in compliance with
7# the License.  You may obtain a copy of the License at
8#
9#    http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17
18"""
19MLlib utilities for linear algebra. For dense vectors, MLlib
20uses the NumPy C{array} type, so you can simply pass NumPy arrays
21around. For sparse vectors, users can construct a L{SparseVector}
22object from MLlib or pass SciPy C{scipy.sparse} column vectors if
23SciPy is available in their environment.
24"""
25
26import sys
27import array
28import struct
29
30if sys.version >= '3':
31    basestring = str
32    xrange = range
33    import copyreg as copy_reg
34    long = int
35else:
36    from itertools import izip as zip
37    import copy_reg
38
39import numpy as np
40
41from pyspark import since
42from pyspark.sql.types import UserDefinedType, StructField, StructType, ArrayType, DoubleType, \
43    IntegerType, ByteType, BooleanType
44
45
46__all__ = ['Vector', 'DenseVector', 'SparseVector', 'Vectors',
47           'Matrix', 'DenseMatrix', 'SparseMatrix', 'Matrices']
48
49
50if sys.version_info[:2] == (2, 7):
51    # speed up pickling array in Python 2.7
52    def fast_pickle_array(ar):
53        return array.array, (ar.typecode, ar.tostring())
54    copy_reg.pickle(array.array, fast_pickle_array)
55
56
57# Check whether we have SciPy. MLlib works without it too, but if we have it, some methods,
58# such as _dot and _serialize_double_vector, start to support scipy.sparse matrices.
59
60try:
61    import scipy.sparse
62    _have_scipy = True
63except:
64    # No SciPy in environment, but that's okay
65    _have_scipy = False
66
67
68def _convert_to_vector(l):
69    if isinstance(l, Vector):
70        return l
71    elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange):
72        return DenseVector(l)
73    elif _have_scipy and scipy.sparse.issparse(l):
74        assert l.shape[1] == 1, "Expected column vector"
75        # Make sure the converted csc_matrix has sorted indices.
76        csc = l.tocsc()
77        if not csc.has_sorted_indices:
78            csc.sort_indices()
79        return SparseVector(l.shape[0], csc.indices, csc.data)
80    else:
81        raise TypeError("Cannot convert type %s into Vector" % type(l))
82
83
84def _vector_size(v):
85    """
86    Returns the size of the vector.
87
88    >>> _vector_size([1., 2., 3.])
89    3
90    >>> _vector_size((1., 2., 3.))
91    3
92    >>> _vector_size(array.array('d', [1., 2., 3.]))
93    3
94    >>> _vector_size(np.zeros(3))
95    3
96    >>> _vector_size(np.zeros((3, 1)))
97    3
98    >>> _vector_size(np.zeros((1, 3)))
99    Traceback (most recent call last):
100        ...
101    ValueError: Cannot treat an ndarray of shape (1, 3) as a vector
102    """
103    if isinstance(v, Vector):
104        return len(v)
105    elif type(v) in (array.array, list, tuple, xrange):
106        return len(v)
107    elif type(v) == np.ndarray:
108        if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):
109            return len(v)
110        else:
111            raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
112    elif _have_scipy and scipy.sparse.issparse(v):
113        assert v.shape[1] == 1, "Expected column vector"
114        return v.shape[0]
115    else:
116        raise TypeError("Cannot treat type %s as a vector" % type(v))
117
118
119def _format_float(f, digits=4):
120    s = str(round(f, digits))
121    if '.' in s:
122        s = s[:s.index('.') + 1 + digits]
123    return s
124
125
126def _format_float_list(l):
127    return [_format_float(x) for x in l]
128
129
130def _double_to_long_bits(value):
131    if np.isnan(value):
132        value = float('nan')
133    # pack double into 64 bits, then unpack as long int
134    return struct.unpack('Q', struct.pack('d', value))[0]
135
136
137class VectorUDT(UserDefinedType):
138    """
139    SQL user-defined type (UDT) for Vector.
140    """
141
142    @classmethod
143    def sqlType(cls):
144        return StructType([
145            StructField("type", ByteType(), False),
146            StructField("size", IntegerType(), True),
147            StructField("indices", ArrayType(IntegerType(), False), True),
148            StructField("values", ArrayType(DoubleType(), False), True)])
149
150    @classmethod
151    def module(cls):
152        return "pyspark.ml.linalg"
153
154    @classmethod
155    def scalaUDT(cls):
156        return "org.apache.spark.ml.linalg.VectorUDT"
157
158    def serialize(self, obj):
159        if isinstance(obj, SparseVector):
160            indices = [int(i) for i in obj.indices]
161            values = [float(v) for v in obj.values]
162            return (0, obj.size, indices, values)
163        elif isinstance(obj, DenseVector):
164            values = [float(v) for v in obj]
165            return (1, None, None, values)
166        else:
167            raise TypeError("cannot serialize %r of type %r" % (obj, type(obj)))
168
169    def deserialize(self, datum):
170        assert len(datum) == 4, \
171            "VectorUDT.deserialize given row with length %d but requires 4" % len(datum)
172        tpe = datum[0]
173        if tpe == 0:
174            return SparseVector(datum[1], datum[2], datum[3])
175        elif tpe == 1:
176            return DenseVector(datum[3])
177        else:
178            raise ValueError("do not recognize type %r" % tpe)
179
180    def simpleString(self):
181        return "vector"
182
183
184class MatrixUDT(UserDefinedType):
185    """
186    SQL user-defined type (UDT) for Matrix.
187    """
188
189    @classmethod
190    def sqlType(cls):
191        return StructType([
192            StructField("type", ByteType(), False),
193            StructField("numRows", IntegerType(), False),
194            StructField("numCols", IntegerType(), False),
195            StructField("colPtrs", ArrayType(IntegerType(), False), True),
196            StructField("rowIndices", ArrayType(IntegerType(), False), True),
197            StructField("values", ArrayType(DoubleType(), False), True),
198            StructField("isTransposed", BooleanType(), False)])
199
200    @classmethod
201    def module(cls):
202        return "pyspark.ml.linalg"
203
204    @classmethod
205    def scalaUDT(cls):
206        return "org.apache.spark.ml.linalg.MatrixUDT"
207
208    def serialize(self, obj):
209        if isinstance(obj, SparseMatrix):
210            colPtrs = [int(i) for i in obj.colPtrs]
211            rowIndices = [int(i) for i in obj.rowIndices]
212            values = [float(v) for v in obj.values]
213            return (0, obj.numRows, obj.numCols, colPtrs,
214                    rowIndices, values, bool(obj.isTransposed))
215        elif isinstance(obj, DenseMatrix):
216            values = [float(v) for v in obj.values]
217            return (1, obj.numRows, obj.numCols, None, None, values,
218                    bool(obj.isTransposed))
219        else:
220            raise TypeError("cannot serialize type %r" % (type(obj)))
221
222    def deserialize(self, datum):
223        assert len(datum) == 7, \
224            "MatrixUDT.deserialize given row with length %d but requires 7" % len(datum)
225        tpe = datum[0]
226        if tpe == 0:
227            return SparseMatrix(*datum[1:])
228        elif tpe == 1:
229            return DenseMatrix(datum[1], datum[2], datum[5], datum[6])
230        else:
231            raise ValueError("do not recognize type %r" % tpe)
232
233    def simpleString(self):
234        return "matrix"
235
236
237class Vector(object):
238
239    __UDT__ = VectorUDT()
240
241    """
242    Abstract class for DenseVector and SparseVector
243    """
244    def toArray(self):
245        """
246        Convert the vector into an numpy.ndarray
247
248        :return: numpy.ndarray
249        """
250        raise NotImplementedError
251
252
253class DenseVector(Vector):
254    """
255    A dense vector represented by a value array. We use numpy array for
256    storage and arithmetics will be delegated to the underlying numpy
257    array.
258
259    >>> v = Vectors.dense([1.0, 2.0])
260    >>> u = Vectors.dense([3.0, 4.0])
261    >>> v + u
262    DenseVector([4.0, 6.0])
263    >>> 2 - v
264    DenseVector([1.0, 0.0])
265    >>> v / 2
266    DenseVector([0.5, 1.0])
267    >>> v * u
268    DenseVector([3.0, 8.0])
269    >>> u / v
270    DenseVector([3.0, 2.0])
271    >>> u % 2
272    DenseVector([1.0, 0.0])
273    """
274    def __init__(self, ar):
275        if isinstance(ar, bytes):
276            ar = np.frombuffer(ar, dtype=np.float64)
277        elif not isinstance(ar, np.ndarray):
278            ar = np.array(ar, dtype=np.float64)
279        if ar.dtype != np.float64:
280            ar = ar.astype(np.float64)
281        self.array = ar
282
283    def __reduce__(self):
284        return DenseVector, (self.array.tostring(),)
285
286    def numNonzeros(self):
287        """
288        Number of nonzero elements. This scans all active values and count non zeros
289        """
290        return np.count_nonzero(self.array)
291
292    def norm(self, p):
293        """
294        Calculates the norm of a DenseVector.
295
296        >>> a = DenseVector([0, -1, 2, -3])
297        >>> a.norm(2)
298        3.7...
299        >>> a.norm(1)
300        6.0
301        """
302        return np.linalg.norm(self.array, p)
303
304    def dot(self, other):
305        """
306        Compute the dot product of two Vectors. We support
307        (Numpy array, list, SparseVector, or SciPy sparse)
308        and a target NumPy array that is either 1- or 2-dimensional.
309        Equivalent to calling numpy.dot of the two vectors.
310
311        >>> dense = DenseVector(array.array('d', [1., 2.]))
312        >>> dense.dot(dense)
313        5.0
314        >>> dense.dot(SparseVector(2, [0, 1], [2., 1.]))
315        4.0
316        >>> dense.dot(range(1, 3))
317        5.0
318        >>> dense.dot(np.array(range(1, 3)))
319        5.0
320        >>> dense.dot([1.,])
321        Traceback (most recent call last):
322            ...
323        AssertionError: dimension mismatch
324        >>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F'))
325        array([  5.,  11.])
326        >>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F'))
327        Traceback (most recent call last):
328            ...
329        AssertionError: dimension mismatch
330        """
331        if type(other) == np.ndarray:
332            if other.ndim > 1:
333                assert len(self) == other.shape[0], "dimension mismatch"
334            return np.dot(self.array, other)
335        elif _have_scipy and scipy.sparse.issparse(other):
336            assert len(self) == other.shape[0], "dimension mismatch"
337            return other.transpose().dot(self.toArray())
338        else:
339            assert len(self) == _vector_size(other), "dimension mismatch"
340            if isinstance(other, SparseVector):
341                return other.dot(self)
342            elif isinstance(other, Vector):
343                return np.dot(self.toArray(), other.toArray())
344            else:
345                return np.dot(self.toArray(), other)
346
347    def squared_distance(self, other):
348        """
349        Squared distance of two Vectors.
350
351        >>> dense1 = DenseVector(array.array('d', [1., 2.]))
352        >>> dense1.squared_distance(dense1)
353        0.0
354        >>> dense2 = np.array([2., 1.])
355        >>> dense1.squared_distance(dense2)
356        2.0
357        >>> dense3 = [2., 1.]
358        >>> dense1.squared_distance(dense3)
359        2.0
360        >>> sparse1 = SparseVector(2, [0, 1], [2., 1.])
361        >>> dense1.squared_distance(sparse1)
362        2.0
363        >>> dense1.squared_distance([1.,])
364        Traceback (most recent call last):
365            ...
366        AssertionError: dimension mismatch
367        >>> dense1.squared_distance(SparseVector(1, [0,], [1.,]))
368        Traceback (most recent call last):
369            ...
370        AssertionError: dimension mismatch
371        """
372        assert len(self) == _vector_size(other), "dimension mismatch"
373        if isinstance(other, SparseVector):
374            return other.squared_distance(self)
375        elif _have_scipy and scipy.sparse.issparse(other):
376            return _convert_to_vector(other).squared_distance(self)
377
378        if isinstance(other, Vector):
379            other = other.toArray()
380        elif not isinstance(other, np.ndarray):
381            other = np.array(other)
382        diff = self.toArray() - other
383        return np.dot(diff, diff)
384
385    def toArray(self):
386        """
387        Returns an numpy.ndarray
388        """
389        return self.array
390
391    @property
392    def values(self):
393        """
394        Returns a list of values
395        """
396        return self.array
397
398    def __getitem__(self, item):
399        return self.array[item]
400
401    def __len__(self):
402        return len(self.array)
403
404    def __str__(self):
405        return "[" + ",".join([str(v) for v in self.array]) + "]"
406
407    def __repr__(self):
408        return "DenseVector([%s])" % (', '.join(_format_float(i) for i in self.array))
409
410    def __eq__(self, other):
411        if isinstance(other, DenseVector):
412            return np.array_equal(self.array, other.array)
413        elif isinstance(other, SparseVector):
414            if len(self) != other.size:
415                return False
416            return Vectors._equals(list(xrange(len(self))), self.array, other.indices, other.values)
417        return False
418
419    def __ne__(self, other):
420        return not self == other
421
422    def __hash__(self):
423        size = len(self)
424        result = 31 + size
425        nnz = 0
426        i = 0
427        while i < size and nnz < 128:
428            if self.array[i] != 0:
429                result = 31 * result + i
430                bits = _double_to_long_bits(self.array[i])
431                result = 31 * result + (bits ^ (bits >> 32))
432                nnz += 1
433            i += 1
434        return result
435
436    def __getattr__(self, item):
437        return getattr(self.array, item)
438
439    def _delegate(op):
440        def func(self, other):
441            if isinstance(other, DenseVector):
442                other = other.array
443            return DenseVector(getattr(self.array, op)(other))
444        return func
445
446    __neg__ = _delegate("__neg__")
447    __add__ = _delegate("__add__")
448    __sub__ = _delegate("__sub__")
449    __mul__ = _delegate("__mul__")
450    __div__ = _delegate("__div__")
451    __truediv__ = _delegate("__truediv__")
452    __mod__ = _delegate("__mod__")
453    __radd__ = _delegate("__radd__")
454    __rsub__ = _delegate("__rsub__")
455    __rmul__ = _delegate("__rmul__")
456    __rdiv__ = _delegate("__rdiv__")
457    __rtruediv__ = _delegate("__rtruediv__")
458    __rmod__ = _delegate("__rmod__")
459
460
461class SparseVector(Vector):
462    """
463    A simple sparse vector class for passing data to MLlib. Users may
464    alternatively pass SciPy's {scipy.sparse} data types.
465    """
466    def __init__(self, size, *args):
467        """
468        Create a sparse vector, using either a dictionary, a list of
469        (index, value) pairs, or two separate arrays of indices and
470        values (sorted by index).
471
472        :param size: Size of the vector.
473        :param args: Active entries, as a dictionary {index: value, ...},
474          a list of tuples [(index, value), ...], or a list of strictly
475          increasing indices and a list of corresponding values [index, ...],
476          [value, ...]. Inactive entries are treated as zeros.
477
478        >>> SparseVector(4, {1: 1.0, 3: 5.5})
479        SparseVector(4, {1: 1.0, 3: 5.5})
480        >>> SparseVector(4, [(1, 1.0), (3, 5.5)])
481        SparseVector(4, {1: 1.0, 3: 5.5})
482        >>> SparseVector(4, [1, 3], [1.0, 5.5])
483        SparseVector(4, {1: 1.0, 3: 5.5})
484        >>> SparseVector(4, {1:1.0, 6:2.0})
485        Traceback (most recent call last):
486        ...
487        AssertionError: Index 6 is out of the the size of vector with size=4
488        >>> SparseVector(4, {-1:1.0})
489        Traceback (most recent call last):
490        ...
491        AssertionError: Contains negative index -1
492        """
493        self.size = int(size)
494        """ Size of the vector. """
495        assert 1 <= len(args) <= 2, "must pass either 2 or 3 arguments"
496        if len(args) == 1:
497            pairs = args[0]
498            if type(pairs) == dict:
499                pairs = pairs.items()
500            pairs = sorted(pairs)
501            self.indices = np.array([p[0] for p in pairs], dtype=np.int32)
502            """ A list of indices corresponding to active entries. """
503            self.values = np.array([p[1] for p in pairs], dtype=np.float64)
504            """ A list of values corresponding to active entries. """
505        else:
506            if isinstance(args[0], bytes):
507                assert isinstance(args[1], bytes), "values should be string too"
508                if args[0]:
509                    self.indices = np.frombuffer(args[0], np.int32)
510                    self.values = np.frombuffer(args[1], np.float64)
511                else:
512                    # np.frombuffer() doesn't work well with empty string in older version
513                    self.indices = np.array([], dtype=np.int32)
514                    self.values = np.array([], dtype=np.float64)
515            else:
516                self.indices = np.array(args[0], dtype=np.int32)
517                self.values = np.array(args[1], dtype=np.float64)
518            assert len(self.indices) == len(self.values), "index and value arrays not same length"
519            for i in xrange(len(self.indices) - 1):
520                if self.indices[i] >= self.indices[i + 1]:
521                    raise TypeError(
522                        "Indices %s and %s are not strictly increasing"
523                        % (self.indices[i], self.indices[i + 1]))
524
525        if self.indices.size > 0:
526            assert np.max(self.indices) < self.size, \
527                "Index %d is out of the the size of vector with size=%d" \
528                % (np.max(self.indices), self.size)
529            assert np.min(self.indices) >= 0, \
530                "Contains negative index %d" % (np.min(self.indices))
531
532    def numNonzeros(self):
533        """
534        Number of nonzero elements. This scans all active values and count non zeros.
535        """
536        return np.count_nonzero(self.values)
537
538    def norm(self, p):
539        """
540        Calculates the norm of a SparseVector.
541
542        >>> a = SparseVector(4, [0, 1], [3., -4.])
543        >>> a.norm(1)
544        7.0
545        >>> a.norm(2)
546        5.0
547        """
548        return np.linalg.norm(self.values, p)
549
550    def __reduce__(self):
551        return (
552            SparseVector,
553            (self.size, self.indices.tostring(), self.values.tostring()))
554
555    def dot(self, other):
556        """
557        Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
558
559        >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
560        >>> a.dot(a)
561        25.0
562        >>> a.dot(array.array('d', [1., 2., 3., 4.]))
563        22.0
564        >>> b = SparseVector(4, [2], [1.0])
565        >>> a.dot(b)
566        0.0
567        >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))
568        array([ 22.,  22.])
569        >>> a.dot([1., 2., 3.])
570        Traceback (most recent call last):
571            ...
572        AssertionError: dimension mismatch
573        >>> a.dot(np.array([1., 2.]))
574        Traceback (most recent call last):
575            ...
576        AssertionError: dimension mismatch
577        >>> a.dot(DenseVector([1., 2.]))
578        Traceback (most recent call last):
579            ...
580        AssertionError: dimension mismatch
581        >>> a.dot(np.zeros((3, 2)))
582        Traceback (most recent call last):
583            ...
584        AssertionError: dimension mismatch
585        """
586
587        if isinstance(other, np.ndarray):
588            if other.ndim not in [2, 1]:
589                raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim)
590            assert len(self) == other.shape[0], "dimension mismatch"
591            return np.dot(self.values, other[self.indices])
592
593        assert len(self) == _vector_size(other), "dimension mismatch"
594
595        if isinstance(other, DenseVector):
596            return np.dot(other.array[self.indices], self.values)
597
598        elif isinstance(other, SparseVector):
599            # Find out common indices.
600            self_cmind = np.in1d(self.indices, other.indices, assume_unique=True)
601            self_values = self.values[self_cmind]
602            if self_values.size == 0:
603                return 0.0
604            else:
605                other_cmind = np.in1d(other.indices, self.indices, assume_unique=True)
606                return np.dot(self_values, other.values[other_cmind])
607
608        else:
609            return self.dot(_convert_to_vector(other))
610
611    def squared_distance(self, other):
612        """
613        Squared distance from a SparseVector or 1-dimensional NumPy array.
614
615        >>> a = SparseVector(4, [1, 3], [3.0, 4.0])
616        >>> a.squared_distance(a)
617        0.0
618        >>> a.squared_distance(array.array('d', [1., 2., 3., 4.]))
619        11.0
620        >>> a.squared_distance(np.array([1., 2., 3., 4.]))
621        11.0
622        >>> b = SparseVector(4, [2], [1.0])
623        >>> a.squared_distance(b)
624        26.0
625        >>> b.squared_distance(a)
626        26.0
627        >>> b.squared_distance([1., 2.])
628        Traceback (most recent call last):
629            ...
630        AssertionError: dimension mismatch
631        >>> b.squared_distance(SparseVector(3, [1,], [1.0,]))
632        Traceback (most recent call last):
633            ...
634        AssertionError: dimension mismatch
635        """
636        assert len(self) == _vector_size(other), "dimension mismatch"
637
638        if isinstance(other, np.ndarray) or isinstance(other, DenseVector):
639            if isinstance(other, np.ndarray) and other.ndim != 1:
640                raise Exception("Cannot call squared_distance with %d-dimensional array" %
641                                other.ndim)
642            if isinstance(other, DenseVector):
643                other = other.array
644            sparse_ind = np.zeros(other.size, dtype=bool)
645            sparse_ind[self.indices] = True
646            dist = other[sparse_ind] - self.values
647            result = np.dot(dist, dist)
648
649            other_ind = other[~sparse_ind]
650            result += np.dot(other_ind, other_ind)
651            return result
652
653        elif isinstance(other, SparseVector):
654            result = 0.0
655            i, j = 0, 0
656            while i < len(self.indices) and j < len(other.indices):
657                if self.indices[i] == other.indices[j]:
658                    diff = self.values[i] - other.values[j]
659                    result += diff * diff
660                    i += 1
661                    j += 1
662                elif self.indices[i] < other.indices[j]:
663                    result += self.values[i] * self.values[i]
664                    i += 1
665                else:
666                    result += other.values[j] * other.values[j]
667                    j += 1
668            while i < len(self.indices):
669                result += self.values[i] * self.values[i]
670                i += 1
671            while j < len(other.indices):
672                result += other.values[j] * other.values[j]
673                j += 1
674            return result
675        else:
676            return self.squared_distance(_convert_to_vector(other))
677
678    def toArray(self):
679        """
680        Returns a copy of this SparseVector as a 1-dimensional NumPy array.
681        """
682        arr = np.zeros((self.size,), dtype=np.float64)
683        arr[self.indices] = self.values
684        return arr
685
686    def __len__(self):
687        return self.size
688
689    def __str__(self):
690        inds = "[" + ",".join([str(i) for i in self.indices]) + "]"
691        vals = "[" + ",".join([str(v) for v in self.values]) + "]"
692        return "(" + ",".join((str(self.size), inds, vals)) + ")"
693
694    def __repr__(self):
695        inds = self.indices
696        vals = self.values
697        entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i]))
698                             for i in xrange(len(inds))])
699        return "SparseVector({0}, {{{1}}})".format(self.size, entries)
700
701    def __eq__(self, other):
702        if isinstance(other, SparseVector):
703            return other.size == self.size and np.array_equal(other.indices, self.indices) \
704                and np.array_equal(other.values, self.values)
705        elif isinstance(other, DenseVector):
706            if self.size != len(other):
707                return False
708            return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array)
709        return False
710
711    def __getitem__(self, index):
712        inds = self.indices
713        vals = self.values
714        if not isinstance(index, int):
715            raise TypeError(
716                "Indices must be of type integer, got type %s" % type(index))
717
718        if index >= self.size or index < -self.size:
719            raise IndexError("Index %d out of bounds." % index)
720        if index < 0:
721            index += self.size
722
723        if (inds.size == 0) or (index > inds.item(-1)):
724            return 0.
725
726        insert_index = np.searchsorted(inds, index)
727        row_ind = inds[insert_index]
728        if row_ind == index:
729            return vals[insert_index]
730        return 0.
731
732    def __ne__(self, other):
733        return not self.__eq__(other)
734
735    def __hash__(self):
736        result = 31 + self.size
737        nnz = 0
738        i = 0
739        while i < len(self.values) and nnz < 128:
740            if self.values[i] != 0:
741                result = 31 * result + int(self.indices[i])
742                bits = _double_to_long_bits(self.values[i])
743                result = 31 * result + (bits ^ (bits >> 32))
744                nnz += 1
745            i += 1
746        return result
747
748
749class Vectors(object):
750
751    """
752    Factory methods for working with vectors.
753
754    .. note:: Dense vectors are simply represented as NumPy array objects,
755        so there is no need to covert them for use in MLlib. For sparse vectors,
756        the factory methods in this class create an MLlib-compatible type, or users
757        can pass in SciPy's C{scipy.sparse} column vectors.
758    """
759
760    @staticmethod
761    def sparse(size, *args):
762        """
763        Create a sparse vector, using either a dictionary, a list of
764        (index, value) pairs, or two separate arrays of indices and
765        values (sorted by index).
766
767        :param size: Size of the vector.
768        :param args: Non-zero entries, as a dictionary, list of tuples,
769                     or two sorted lists containing indices and values.
770
771        >>> Vectors.sparse(4, {1: 1.0, 3: 5.5})
772        SparseVector(4, {1: 1.0, 3: 5.5})
773        >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)])
774        SparseVector(4, {1: 1.0, 3: 5.5})
775        >>> Vectors.sparse(4, [1, 3], [1.0, 5.5])
776        SparseVector(4, {1: 1.0, 3: 5.5})
777        """
778        return SparseVector(size, *args)
779
780    @staticmethod
781    def dense(*elements):
782        """
783        Create a dense vector of 64-bit floats from a Python list or numbers.
784
785        >>> Vectors.dense([1, 2, 3])
786        DenseVector([1.0, 2.0, 3.0])
787        >>> Vectors.dense(1.0, 2.0)
788        DenseVector([1.0, 2.0])
789        """
790        if len(elements) == 1 and not isinstance(elements[0], (float, int, long)):
791            # it's list, numpy.array or other iterable object.
792            elements = elements[0]
793        return DenseVector(elements)
794
795    @staticmethod
796    def squared_distance(v1, v2):
797        """
798        Squared distance between two vectors.
799        a and b can be of type SparseVector, DenseVector, np.ndarray
800        or array.array.
801
802        >>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
803        >>> b = Vectors.dense([2, 5, 4, 1])
804        >>> a.squared_distance(b)
805        51.0
806        """
807        v1, v2 = _convert_to_vector(v1), _convert_to_vector(v2)
808        return v1.squared_distance(v2)
809
810    @staticmethod
811    def norm(vector, p):
812        """
813        Find norm of the given vector.
814        """
815        return _convert_to_vector(vector).norm(p)
816
817    @staticmethod
818    def zeros(size):
819        return DenseVector(np.zeros(size))
820
821    @staticmethod
822    def _equals(v1_indices, v1_values, v2_indices, v2_values):
823        """
824        Check equality between sparse/dense vectors,
825        v1_indices and v2_indices assume to be strictly increasing.
826        """
827        v1_size = len(v1_values)
828        v2_size = len(v2_values)
829        k1 = 0
830        k2 = 0
831        all_equal = True
832        while all_equal:
833            while k1 < v1_size and v1_values[k1] == 0:
834                k1 += 1
835            while k2 < v2_size and v2_values[k2] == 0:
836                k2 += 1
837
838            if k1 >= v1_size or k2 >= v2_size:
839                return k1 >= v1_size and k2 >= v2_size
840
841            all_equal = v1_indices[k1] == v2_indices[k2] and v1_values[k1] == v2_values[k2]
842            k1 += 1
843            k2 += 1
844        return all_equal
845
846
847class Matrix(object):
848
849    __UDT__ = MatrixUDT()
850
851    """
852    Represents a local matrix.
853    """
854    def __init__(self, numRows, numCols, isTransposed=False):
855        self.numRows = numRows
856        self.numCols = numCols
857        self.isTransposed = isTransposed
858
859    def toArray(self):
860        """
861        Returns its elements in a NumPy ndarray.
862        """
863        raise NotImplementedError
864
865    @staticmethod
866    def _convert_to_array(array_like, dtype):
867        """
868        Convert Matrix attributes which are array-like or buffer to array.
869        """
870        if isinstance(array_like, bytes):
871            return np.frombuffer(array_like, dtype=dtype)
872        return np.asarray(array_like, dtype=dtype)
873
874
875class DenseMatrix(Matrix):
876    """
877    Column-major dense matrix.
878    """
879    def __init__(self, numRows, numCols, values, isTransposed=False):
880        Matrix.__init__(self, numRows, numCols, isTransposed)
881        values = self._convert_to_array(values, np.float64)
882        assert len(values) == numRows * numCols
883        self.values = values
884
885    def __reduce__(self):
886        return DenseMatrix, (
887            self.numRows, self.numCols, self.values.tostring(),
888            int(self.isTransposed))
889
890    def __str__(self):
891        """
892        Pretty printing of a DenseMatrix
893
894        >>> dm = DenseMatrix(2, 2, range(4))
895        >>> print(dm)
896        DenseMatrix([[ 0.,  2.],
897                     [ 1.,  3.]])
898        >>> dm = DenseMatrix(2, 2, range(4), isTransposed=True)
899        >>> print(dm)
900        DenseMatrix([[ 0.,  1.],
901                     [ 2.,  3.]])
902        """
903        # Inspired by __repr__ in scipy matrices.
904        array_lines = repr(self.toArray()).splitlines()
905
906        # We need to adjust six spaces which is the difference in number
907        # of letters between "DenseMatrix" and "array"
908        x = '\n'.join([(" " * 6 + line) for line in array_lines[1:]])
909        return array_lines[0].replace("array", "DenseMatrix") + "\n" + x
910
911    def __repr__(self):
912        """
913        Representation of a DenseMatrix
914
915        >>> dm = DenseMatrix(2, 2, range(4))
916        >>> dm
917        DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False)
918        """
919        # If the number of values are less than seventeen then return as it is.
920        # Else return first eight values and last eight values.
921        if len(self.values) < 17:
922            entries = _format_float_list(self.values)
923        else:
924            entries = (
925                _format_float_list(self.values[:8]) +
926                ["..."] +
927                _format_float_list(self.values[-8:])
928            )
929
930        entries = ", ".join(entries)
931        return "DenseMatrix({0}, {1}, [{2}], {3})".format(
932            self.numRows, self.numCols, entries, self.isTransposed)
933
934    def toArray(self):
935        """
936        Return an numpy.ndarray
937
938        >>> m = DenseMatrix(2, 2, range(4))
939        >>> m.toArray()
940        array([[ 0.,  2.],
941               [ 1.,  3.]])
942        """
943        if self.isTransposed:
944            return np.asfortranarray(
945                self.values.reshape((self.numRows, self.numCols)))
946        else:
947            return self.values.reshape((self.numRows, self.numCols), order='F')
948
949    def toSparse(self):
950        """Convert to SparseMatrix"""
951        if self.isTransposed:
952            values = np.ravel(self.toArray(), order='F')
953        else:
954            values = self.values
955        indices = np.nonzero(values)[0]
956        colCounts = np.bincount(indices // self.numRows)
957        colPtrs = np.cumsum(np.hstack(
958            (0, colCounts, np.zeros(self.numCols - colCounts.size))))
959        values = values[indices]
960        rowIndices = indices % self.numRows
961
962        return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values)
963
964    def __getitem__(self, indices):
965        i, j = indices
966        if i < 0 or i >= self.numRows:
967            raise IndexError("Row index %d is out of range [0, %d)"
968                             % (i, self.numRows))
969        if j >= self.numCols or j < 0:
970            raise IndexError("Column index %d is out of range [0, %d)"
971                             % (j, self.numCols))
972
973        if self.isTransposed:
974            return self.values[i * self.numCols + j]
975        else:
976            return self.values[i + j * self.numRows]
977
978    def __eq__(self, other):
979        if (not isinstance(other, DenseMatrix) or
980                self.numRows != other.numRows or
981                self.numCols != other.numCols):
982            return False
983
984        self_values = np.ravel(self.toArray(), order='F')
985        other_values = np.ravel(other.toArray(), order='F')
986        return all(self_values == other_values)
987
988
989class SparseMatrix(Matrix):
990    """Sparse Matrix stored in CSC format."""
991    def __init__(self, numRows, numCols, colPtrs, rowIndices, values,
992                 isTransposed=False):
993        Matrix.__init__(self, numRows, numCols, isTransposed)
994        self.colPtrs = self._convert_to_array(colPtrs, np.int32)
995        self.rowIndices = self._convert_to_array(rowIndices, np.int32)
996        self.values = self._convert_to_array(values, np.float64)
997
998        if self.isTransposed:
999            if self.colPtrs.size != numRows + 1:
1000                raise ValueError("Expected colPtrs of size %d, got %d."
1001                                 % (numRows + 1, self.colPtrs.size))
1002        else:
1003            if self.colPtrs.size != numCols + 1:
1004                raise ValueError("Expected colPtrs of size %d, got %d."
1005                                 % (numCols + 1, self.colPtrs.size))
1006        if self.rowIndices.size != self.values.size:
1007            raise ValueError("Expected rowIndices of length %d, got %d."
1008                             % (self.rowIndices.size, self.values.size))
1009
1010    def __str__(self):
1011        """
1012        Pretty printing of a SparseMatrix
1013
1014        >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
1015        >>> print(sm1)
1016        2 X 2 CSCMatrix
1017        (0,0) 2.0
1018        (1,0) 3.0
1019        (1,1) 4.0
1020        >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
1021        >>> print(sm1)
1022        2 X 2 CSRMatrix
1023        (0,0) 2.0
1024        (0,1) 3.0
1025        (1,1) 4.0
1026        """
1027        spstr = "{0} X {1} ".format(self.numRows, self.numCols)
1028        if self.isTransposed:
1029            spstr += "CSRMatrix\n"
1030        else:
1031            spstr += "CSCMatrix\n"
1032
1033        cur_col = 0
1034        smlist = []
1035
1036        # Display first 16 values.
1037        if len(self.values) <= 16:
1038            zipindval = zip(self.rowIndices, self.values)
1039        else:
1040            zipindval = zip(self.rowIndices[:16], self.values[:16])
1041        for i, (rowInd, value) in enumerate(zipindval):
1042            if self.colPtrs[cur_col + 1] <= i:
1043                cur_col += 1
1044            if self.isTransposed:
1045                smlist.append('({0},{1}) {2}'.format(
1046                    cur_col, rowInd, _format_float(value)))
1047            else:
1048                smlist.append('({0},{1}) {2}'.format(
1049                    rowInd, cur_col, _format_float(value)))
1050        spstr += "\n".join(smlist)
1051
1052        if len(self.values) > 16:
1053            spstr += "\n.." * 2
1054        return spstr
1055
1056    def __repr__(self):
1057        """
1058        Representation of a SparseMatrix
1059
1060        >>> sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
1061        >>> sm1
1062        SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2.0, 3.0, 4.0], False)
1063        """
1064        rowIndices = list(self.rowIndices)
1065        colPtrs = list(self.colPtrs)
1066
1067        if len(self.values) <= 16:
1068            values = _format_float_list(self.values)
1069
1070        else:
1071            values = (
1072                _format_float_list(self.values[:8]) +
1073                ["..."] +
1074                _format_float_list(self.values[-8:])
1075            )
1076            rowIndices = rowIndices[:8] + ["..."] + rowIndices[-8:]
1077
1078        if len(self.colPtrs) > 16:
1079            colPtrs = colPtrs[:8] + ["..."] + colPtrs[-8:]
1080
1081        values = ", ".join(values)
1082        rowIndices = ", ".join([str(ind) for ind in rowIndices])
1083        colPtrs = ", ".join([str(ptr) for ptr in colPtrs])
1084        return "SparseMatrix({0}, {1}, [{2}], [{3}], [{4}], {5})".format(
1085            self.numRows, self.numCols, colPtrs, rowIndices,
1086            values, self.isTransposed)
1087
1088    def __reduce__(self):
1089        return SparseMatrix, (
1090            self.numRows, self.numCols, self.colPtrs.tostring(),
1091            self.rowIndices.tostring(), self.values.tostring(),
1092            int(self.isTransposed))
1093
1094    def __getitem__(self, indices):
1095        i, j = indices
1096        if i < 0 or i >= self.numRows:
1097            raise IndexError("Row index %d is out of range [0, %d)"
1098                             % (i, self.numRows))
1099        if j < 0 or j >= self.numCols:
1100            raise IndexError("Column index %d is out of range [0, %d)"
1101                             % (j, self.numCols))
1102
1103        # If a CSR matrix is given, then the row index should be searched
1104        # for in ColPtrs, and the column index should be searched for in the
1105        # corresponding slice obtained from rowIndices.
1106        if self.isTransposed:
1107            j, i = i, j
1108
1109        colStart = self.colPtrs[j]
1110        colEnd = self.colPtrs[j + 1]
1111        nz = self.rowIndices[colStart: colEnd]
1112        ind = np.searchsorted(nz, i) + colStart
1113        if ind < colEnd and self.rowIndices[ind] == i:
1114            return self.values[ind]
1115        else:
1116            return 0.0
1117
1118    def toArray(self):
1119        """
1120        Return an numpy.ndarray
1121        """
1122        A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F')
1123        for k in xrange(self.colPtrs.size - 1):
1124            startptr = self.colPtrs[k]
1125            endptr = self.colPtrs[k + 1]
1126            if self.isTransposed:
1127                A[k, self.rowIndices[startptr:endptr]] = self.values[startptr:endptr]
1128            else:
1129                A[self.rowIndices[startptr:endptr], k] = self.values[startptr:endptr]
1130        return A
1131
1132    def toDense(self):
1133        densevals = np.ravel(self.toArray(), order='F')
1134        return DenseMatrix(self.numRows, self.numCols, densevals)
1135
1136    # TODO: More efficient implementation:
1137    def __eq__(self, other):
1138        return np.all(self.toArray() == other.toArray())
1139
1140
1141class Matrices(object):
1142    @staticmethod
1143    def dense(numRows, numCols, values):
1144        """
1145        Create a DenseMatrix
1146        """
1147        return DenseMatrix(numRows, numCols, values)
1148
1149    @staticmethod
1150    def sparse(numRows, numCols, colPtrs, rowIndices, values):
1151        """
1152        Create a SparseMatrix
1153        """
1154        return SparseMatrix(numRows, numCols, colPtrs, rowIndices, values)
1155
1156
1157def _test():
1158    import doctest
1159    (failure_count, test_count) = doctest.testmod(optionflags=doctest.ELLIPSIS)
1160    if failure_count:
1161        exit(-1)
1162
1163if __name__ == "__main__":
1164    _test()
1165