1/*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements.  See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License.  You may obtain a copy of the License at
8 *
9 *    http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18package org.apache.spark.mllib.linalg
19
20import java.lang.{Double => JavaDouble, Integer => JavaInteger, Iterable => JavaIterable}
21import java.util
22
23import scala.annotation.varargs
24import scala.collection.JavaConverters._
25import scala.language.implicitConversions
26
27import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
28import org.json4s.DefaultFormats
29import org.json4s.JsonDSL._
30import org.json4s.jackson.JsonMethods.{compact, parse => parseJson, render}
31
32import org.apache.spark.SparkException
33import org.apache.spark.annotation.{AlphaComponent, Since}
34import org.apache.spark.ml.{linalg => newlinalg}
35import org.apache.spark.mllib.util.NumericParser
36import org.apache.spark.sql.catalyst.InternalRow
37import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData}
38import org.apache.spark.sql.types._
39
40/**
41 * Represents a numeric vector, whose index type is Int and value type is Double.
42 *
43 * @note Users should not implement this interface.
44 */
45@SQLUserDefinedType(udt = classOf[VectorUDT])
46@Since("1.0.0")
47sealed trait Vector extends Serializable {
48
49  /**
50   * Size of the vector.
51   */
52  @Since("1.0.0")
53  def size: Int
54
55  /**
56   * Converts the instance to a double array.
57   */
58  @Since("1.0.0")
59  def toArray: Array[Double]
60
61  override def equals(other: Any): Boolean = {
62    other match {
63      case v2: Vector =>
64        if (this.size != v2.size) return false
65        (this, v2) match {
66          case (s1: SparseVector, s2: SparseVector) =>
67            Vectors.equals(s1.indices, s1.values, s2.indices, s2.values)
68          case (s1: SparseVector, d1: DenseVector) =>
69            Vectors.equals(s1.indices, s1.values, 0 until d1.size, d1.values)
70          case (d1: DenseVector, s1: SparseVector) =>
71            Vectors.equals(0 until d1.size, d1.values, s1.indices, s1.values)
72          case (_, _) => util.Arrays.equals(this.toArray, v2.toArray)
73        }
74      case _ => false
75    }
76  }
77
78  /**
79   * Returns a hash code value for the vector. The hash code is based on its size and its first 128
80   * nonzero entries, using a hash algorithm similar to `java.util.Arrays.hashCode`.
81   */
82  override def hashCode(): Int = {
83    // This is a reference implementation. It calls return in foreachActive, which is slow.
84    // Subclasses should override it with optimized implementation.
85    var result: Int = 31 + size
86    var nnz = 0
87    this.foreachActive { (index, value) =>
88      if (nnz < Vectors.MAX_HASH_NNZ) {
89        // ignore explicit 0 for comparison between sparse and dense
90        if (value != 0) {
91          result = 31 * result + index
92          val bits = java.lang.Double.doubleToLongBits(value)
93          result = 31 * result + (bits ^ (bits >>> 32)).toInt
94          nnz += 1
95        }
96      } else {
97        return result
98      }
99    }
100    result
101  }
102
103  /**
104   * Converts the instance to a breeze vector.
105   */
106  private[spark] def asBreeze: BV[Double]
107
108  /**
109   * Gets the value of the ith element.
110   * @param i index
111   */
112  @Since("1.1.0")
113  def apply(i: Int): Double = asBreeze(i)
114
115  /**
116   * Makes a deep copy of this vector.
117   */
118  @Since("1.1.0")
119  def copy: Vector = {
120    throw new NotImplementedError(s"copy is not implemented for ${this.getClass}.")
121  }
122
123  /**
124   * Applies a function `f` to all the active elements of dense and sparse vector.
125   *
126   * @param f the function takes two parameters where the first parameter is the index of
127   *          the vector with type `Int`, and the second parameter is the corresponding value
128   *          with type `Double`.
129   */
130  @Since("1.6.0")
131  def foreachActive(f: (Int, Double) => Unit): Unit
132
133  /**
134   * Number of active entries.  An "active entry" is an element which is explicitly stored,
135   * regardless of its value.
136   *
137   * @note Inactive entries have value 0.
138   */
139  @Since("1.4.0")
140  def numActives: Int
141
142  /**
143   * Number of nonzero elements. This scans all active values and count nonzeros.
144   */
145  @Since("1.4.0")
146  def numNonzeros: Int
147
148  /**
149   * Converts this vector to a sparse vector with all explicit zeros removed.
150   */
151  @Since("1.4.0")
152  def toSparse: SparseVector
153
154  /**
155   * Converts this vector to a dense vector.
156   */
157  @Since("1.4.0")
158  def toDense: DenseVector = new DenseVector(this.toArray)
159
160  /**
161   * Returns a vector in either dense or sparse format, whichever uses less storage.
162   */
163  @Since("1.4.0")
164  def compressed: Vector = {
165    val nnz = numNonzeros
166    // A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes.
167    if (1.5 * (nnz + 1.0) < size) {
168      toSparse
169    } else {
170      toDense
171    }
172  }
173
174  /**
175   * Find the index of a maximal element.  Returns the first maximal element in case of a tie.
176   * Returns -1 if vector has length 0.
177   */
178  @Since("1.5.0")
179  def argmax: Int
180
181  /**
182   * Converts the vector to a JSON string.
183   */
184  @Since("1.6.0")
185  def toJson: String
186
187  /**
188   * Convert this vector to the new mllib-local representation.
189   * This does NOT copy the data; it copies references.
190   */
191  @Since("2.0.0")
192  def asML: newlinalg.Vector
193}
194
195/**
196 * :: AlphaComponent ::
197 *
198 * User-defined type for [[Vector]] which allows easy interaction with SQL
199 * via [[org.apache.spark.sql.Dataset]].
200 */
201@AlphaComponent
202class VectorUDT extends UserDefinedType[Vector] {
203
204  override def sqlType: StructType = {
205    // type: 0 = sparse, 1 = dense
206    // We only use "values" for dense vectors, and "size", "indices", and "values" for sparse
207    // vectors. The "values" field is nullable because we might want to add binary vectors later,
208    // which uses "size" and "indices", but not "values".
209    StructType(Seq(
210      StructField("type", ByteType, nullable = false),
211      StructField("size", IntegerType, nullable = true),
212      StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true),
213      StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true)))
214  }
215
216  override def serialize(obj: Vector): InternalRow = {
217    obj match {
218      case SparseVector(size, indices, values) =>
219        val row = new GenericInternalRow(4)
220        row.setByte(0, 0)
221        row.setInt(1, size)
222        row.update(2, UnsafeArrayData.fromPrimitiveArray(indices))
223        row.update(3, UnsafeArrayData.fromPrimitiveArray(values))
224        row
225      case DenseVector(values) =>
226        val row = new GenericInternalRow(4)
227        row.setByte(0, 1)
228        row.setNullAt(1)
229        row.setNullAt(2)
230        row.update(3, UnsafeArrayData.fromPrimitiveArray(values))
231        row
232    }
233  }
234
235  override def deserialize(datum: Any): Vector = {
236    datum match {
237      case row: InternalRow =>
238        require(row.numFields == 4,
239          s"VectorUDT.deserialize given row with length ${row.numFields} but requires length == 4")
240        val tpe = row.getByte(0)
241        tpe match {
242          case 0 =>
243            val size = row.getInt(1)
244            val indices = row.getArray(2).toIntArray()
245            val values = row.getArray(3).toDoubleArray()
246            new SparseVector(size, indices, values)
247          case 1 =>
248            val values = row.getArray(3).toDoubleArray()
249            new DenseVector(values)
250        }
251    }
252  }
253
254  override def pyUDT: String = "pyspark.mllib.linalg.VectorUDT"
255
256  override def userClass: Class[Vector] = classOf[Vector]
257
258  override def equals(o: Any): Boolean = {
259    o match {
260      case v: VectorUDT => true
261      case _ => false
262    }
263  }
264
265  // see [SPARK-8647], this achieves the needed constant hash code without constant no.
266  override def hashCode(): Int = classOf[VectorUDT].getName.hashCode()
267
268  override def typeName: String = "vector"
269
270  private[spark] override def asNullable: VectorUDT = this
271}
272
273/**
274 * Factory methods for [[org.apache.spark.mllib.linalg.Vector]].
275 * We don't use the name `Vector` because Scala imports
276 * [[scala.collection.immutable.Vector]] by default.
277 */
278@Since("1.0.0")
279object Vectors {
280
281  /**
282   * Creates a dense vector from its values.
283   */
284  @Since("1.0.0")
285  @varargs
286  def dense(firstValue: Double, otherValues: Double*): Vector =
287    new DenseVector((firstValue +: otherValues).toArray)
288
289  // A dummy implicit is used to avoid signature collision with the one generated by @varargs.
290  /**
291   * Creates a dense vector from a double array.
292   */
293  @Since("1.0.0")
294  def dense(values: Array[Double]): Vector = new DenseVector(values)
295
296  /**
297   * Creates a sparse vector providing its index array and value array.
298   *
299   * @param size vector size.
300   * @param indices index array, must be strictly increasing.
301   * @param values value array, must have the same length as indices.
302   */
303  @Since("1.0.0")
304  def sparse(size: Int, indices: Array[Int], values: Array[Double]): Vector =
305    new SparseVector(size, indices, values)
306
307  /**
308   * Creates a sparse vector using unordered (index, value) pairs.
309   *
310   * @param size vector size.
311   * @param elements vector elements in (index, value) pairs.
312   */
313  @Since("1.0.0")
314  def sparse(size: Int, elements: Seq[(Int, Double)]): Vector = {
315    require(size > 0, "The size of the requested sparse vector must be greater than 0.")
316
317    val (indices, values) = elements.sortBy(_._1).unzip
318    var prev = -1
319    indices.foreach { i =>
320      require(prev < i, s"Found duplicate indices: $i.")
321      prev = i
322    }
323    require(prev < size, s"You may not write an element to index $prev because the declared " +
324      s"size of your vector is $size")
325
326    new SparseVector(size, indices.toArray, values.toArray)
327  }
328
329  /**
330   * Creates a sparse vector using unordered (index, value) pairs in a Java friendly way.
331   *
332   * @param size vector size.
333   * @param elements vector elements in (index, value) pairs.
334   */
335  @Since("1.0.0")
336  def sparse(size: Int, elements: JavaIterable[(JavaInteger, JavaDouble)]): Vector = {
337    sparse(size, elements.asScala.map { case (i, x) =>
338      (i.intValue(), x.doubleValue())
339    }.toSeq)
340  }
341
342  /**
343   * Creates a vector of all zeros.
344   *
345   * @param size vector size
346   * @return a zero vector
347   */
348  @Since("1.1.0")
349  def zeros(size: Int): Vector = {
350    new DenseVector(new Array[Double](size))
351  }
352
353  /**
354   * Parses a string resulted from `Vector.toString` into a [[Vector]].
355   */
356  @Since("1.1.0")
357  def parse(s: String): Vector = {
358    parseNumeric(NumericParser.parse(s))
359  }
360
361  /**
362   * Parses the JSON representation of a vector into a [[Vector]].
363   */
364  @Since("1.6.0")
365  def fromJson(json: String): Vector = {
366    implicit val formats = DefaultFormats
367    val jValue = parseJson(json)
368    (jValue \ "type").extract[Int] match {
369      case 0 => // sparse
370        val size = (jValue \ "size").extract[Int]
371        val indices = (jValue \ "indices").extract[Seq[Int]].toArray
372        val values = (jValue \ "values").extract[Seq[Double]].toArray
373        sparse(size, indices, values)
374      case 1 => // dense
375        val values = (jValue \ "values").extract[Seq[Double]].toArray
376        dense(values)
377      case _ =>
378        throw new IllegalArgumentException(s"Cannot parse $json into a vector.")
379    }
380  }
381
382  private[mllib] def parseNumeric(any: Any): Vector = {
383    any match {
384      case values: Array[Double] =>
385        Vectors.dense(values)
386      case Seq(size: Double, indices: Array[Double], values: Array[Double]) =>
387        Vectors.sparse(size.toInt, indices.map(_.toInt), values)
388      case other =>
389        throw new SparkException(s"Cannot parse $other.")
390    }
391  }
392
393  /**
394   * Creates a vector instance from a breeze vector.
395   */
396  private[spark] def fromBreeze(breezeVector: BV[Double]): Vector = {
397    breezeVector match {
398      case v: BDV[Double] =>
399        if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) {
400          new DenseVector(v.data)
401        } else {
402          new DenseVector(v.toArray)  // Can't use underlying array directly, so make a new one
403        }
404      case v: BSV[Double] =>
405        if (v.index.length == v.used) {
406          new SparseVector(v.length, v.index, v.data)
407        } else {
408          new SparseVector(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used))
409        }
410      case v: BV[_] =>
411        sys.error("Unsupported Breeze vector type: " + v.getClass.getName)
412    }
413  }
414
415  /**
416   * Returns the p-norm of this vector.
417   * @param vector input vector.
418   * @param p norm.
419   * @return norm in L^p^ space.
420   */
421  @Since("1.3.0")
422  def norm(vector: Vector, p: Double): Double = {
423    require(p >= 1.0, "To compute the p-norm of the vector, we require that you specify a p>=1. " +
424      s"You specified p=$p.")
425    val values = vector match {
426      case DenseVector(vs) => vs
427      case SparseVector(n, ids, vs) => vs
428      case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
429    }
430    val size = values.length
431
432    if (p == 1) {
433      var sum = 0.0
434      var i = 0
435      while (i < size) {
436        sum += math.abs(values(i))
437        i += 1
438      }
439      sum
440    } else if (p == 2) {
441      var sum = 0.0
442      var i = 0
443      while (i < size) {
444        sum += values(i) * values(i)
445        i += 1
446      }
447      math.sqrt(sum)
448    } else if (p == Double.PositiveInfinity) {
449      var max = 0.0
450      var i = 0
451      while (i < size) {
452        val value = math.abs(values(i))
453        if (value > max) max = value
454        i += 1
455      }
456      max
457    } else {
458      var sum = 0.0
459      var i = 0
460      while (i < size) {
461        sum += math.pow(math.abs(values(i)), p)
462        i += 1
463      }
464      math.pow(sum, 1.0 / p)
465    }
466  }
467
468  /**
469   * Returns the squared distance between two Vectors.
470   * @param v1 first Vector.
471   * @param v2 second Vector.
472   * @return squared distance between two Vectors.
473   */
474  @Since("1.3.0")
475  def sqdist(v1: Vector, v2: Vector): Double = {
476    require(v1.size == v2.size, s"Vector dimensions do not match: Dim(v1)=${v1.size} and Dim(v2)" +
477      s"=${v2.size}.")
478    var squaredDistance = 0.0
479    (v1, v2) match {
480      case (v1: SparseVector, v2: SparseVector) =>
481        val v1Values = v1.values
482        val v1Indices = v1.indices
483        val v2Values = v2.values
484        val v2Indices = v2.indices
485        val nnzv1 = v1Indices.length
486        val nnzv2 = v2Indices.length
487
488        var kv1 = 0
489        var kv2 = 0
490        while (kv1 < nnzv1 || kv2 < nnzv2) {
491          var score = 0.0
492
493          if (kv2 >= nnzv2 || (kv1 < nnzv1 && v1Indices(kv1) < v2Indices(kv2))) {
494            score = v1Values(kv1)
495            kv1 += 1
496          } else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) {
497            score = v2Values(kv2)
498            kv2 += 1
499          } else {
500            score = v1Values(kv1) - v2Values(kv2)
501            kv1 += 1
502            kv2 += 1
503          }
504          squaredDistance += score * score
505        }
506
507      case (v1: SparseVector, v2: DenseVector) =>
508        squaredDistance = sqdist(v1, v2)
509
510      case (v1: DenseVector, v2: SparseVector) =>
511        squaredDistance = sqdist(v2, v1)
512
513      case (DenseVector(vv1), DenseVector(vv2)) =>
514        var kv = 0
515        val sz = vv1.length
516        while (kv < sz) {
517          val score = vv1(kv) - vv2(kv)
518          squaredDistance += score * score
519          kv += 1
520        }
521      case _ =>
522        throw new IllegalArgumentException("Do not support vector type " + v1.getClass +
523          " and " + v2.getClass)
524    }
525    squaredDistance
526  }
527
528  /**
529   * Returns the squared distance between DenseVector and SparseVector.
530   */
531  private[mllib] def sqdist(v1: SparseVector, v2: DenseVector): Double = {
532    var kv1 = 0
533    var kv2 = 0
534    val indices = v1.indices
535    var squaredDistance = 0.0
536    val nnzv1 = indices.length
537    val nnzv2 = v2.size
538    var iv1 = if (nnzv1 > 0) indices(kv1) else -1
539
540    while (kv2 < nnzv2) {
541      var score = 0.0
542      if (kv2 != iv1) {
543        score = v2(kv2)
544      } else {
545        score = v1.values(kv1) - v2(kv2)
546        if (kv1 < nnzv1 - 1) {
547          kv1 += 1
548          iv1 = indices(kv1)
549        }
550      }
551      squaredDistance += score * score
552      kv2 += 1
553    }
554    squaredDistance
555  }
556
557  /**
558   * Check equality between sparse/dense vectors
559   */
560  private[mllib] def equals(
561      v1Indices: IndexedSeq[Int],
562      v1Values: Array[Double],
563      v2Indices: IndexedSeq[Int],
564      v2Values: Array[Double]): Boolean = {
565    val v1Size = v1Values.length
566    val v2Size = v2Values.length
567    var k1 = 0
568    var k2 = 0
569    var allEqual = true
570    while (allEqual) {
571      while (k1 < v1Size && v1Values(k1) == 0) k1 += 1
572      while (k2 < v2Size && v2Values(k2) == 0) k2 += 1
573
574      if (k1 >= v1Size || k2 >= v2Size) {
575        return k1 >= v1Size && k2 >= v2Size // check end alignment
576      }
577      allEqual = v1Indices(k1) == v2Indices(k2) && v1Values(k1) == v2Values(k2)
578      k1 += 1
579      k2 += 1
580    }
581    allEqual
582  }
583
584  /** Max number of nonzero entries used in computing hash code. */
585  private[linalg] val MAX_HASH_NNZ = 128
586
587  /**
588   * Convert new linalg type to spark.mllib type.  Light copy; only copies references
589   */
590  @Since("2.0.0")
591  def fromML(v: newlinalg.Vector): Vector = v match {
592    case dv: newlinalg.DenseVector =>
593      DenseVector.fromML(dv)
594    case sv: newlinalg.SparseVector =>
595      SparseVector.fromML(sv)
596  }
597}
598
599/**
600 * A dense vector represented by a value array.
601 */
602@Since("1.0.0")
603@SQLUserDefinedType(udt = classOf[VectorUDT])
604class DenseVector @Since("1.0.0") (
605    @Since("1.0.0") val values: Array[Double]) extends Vector {
606
607  @Since("1.0.0")
608  override def size: Int = values.length
609
610  override def toString: String = values.mkString("[", ",", "]")
611
612  @Since("1.0.0")
613  override def toArray: Array[Double] = values
614
615  private[spark] override def asBreeze: BV[Double] = new BDV[Double](values)
616
617  @Since("1.0.0")
618  override def apply(i: Int): Double = values(i)
619
620  @Since("1.1.0")
621  override def copy: DenseVector = {
622    new DenseVector(values.clone())
623  }
624
625  @Since("1.6.0")
626  override def foreachActive(f: (Int, Double) => Unit): Unit = {
627    var i = 0
628    val localValuesSize = values.length
629    val localValues = values
630
631    while (i < localValuesSize) {
632      f(i, localValues(i))
633      i += 1
634    }
635  }
636
637  override def equals(other: Any): Boolean = super.equals(other)
638
639  override def hashCode(): Int = {
640    var result: Int = 31 + size
641    var i = 0
642    val end = values.length
643    var nnz = 0
644    while (i < end && nnz < Vectors.MAX_HASH_NNZ) {
645      val v = values(i)
646      if (v != 0.0) {
647        result = 31 * result + i
648        val bits = java.lang.Double.doubleToLongBits(values(i))
649        result = 31 * result + (bits ^ (bits >>> 32)).toInt
650        nnz += 1
651      }
652      i += 1
653    }
654    result
655  }
656
657  @Since("1.4.0")
658  override def numActives: Int = size
659
660  @Since("1.4.0")
661  override def numNonzeros: Int = {
662    // same as values.count(_ != 0.0) but faster
663    var nnz = 0
664    values.foreach { v =>
665      if (v != 0.0) {
666        nnz += 1
667      }
668    }
669    nnz
670  }
671
672  @Since("1.4.0")
673  override def toSparse: SparseVector = {
674    val nnz = numNonzeros
675    val ii = new Array[Int](nnz)
676    val vv = new Array[Double](nnz)
677    var k = 0
678    foreachActive { (i, v) =>
679      if (v != 0) {
680        ii(k) = i
681        vv(k) = v
682        k += 1
683      }
684    }
685    new SparseVector(size, ii, vv)
686  }
687
688  @Since("1.5.0")
689  override def argmax: Int = {
690    if (size == 0) {
691      -1
692    } else {
693      var maxIdx = 0
694      var maxValue = values(0)
695      var i = 1
696      while (i < size) {
697        if (values(i) > maxValue) {
698          maxIdx = i
699          maxValue = values(i)
700        }
701        i += 1
702      }
703      maxIdx
704    }
705  }
706
707  @Since("1.6.0")
708  override def toJson: String = {
709    val jValue = ("type" -> 1) ~ ("values" -> values.toSeq)
710    compact(render(jValue))
711  }
712
713  @Since("2.0.0")
714  override def asML: newlinalg.DenseVector = {
715    new newlinalg.DenseVector(values)
716  }
717}
718
719@Since("1.3.0")
720object DenseVector {
721
722  /** Extracts the value array from a dense vector. */
723  @Since("1.3.0")
724  def unapply(dv: DenseVector): Option[Array[Double]] = Some(dv.values)
725
726  /**
727   * Convert new linalg type to spark.mllib type.  Light copy; only copies references
728   */
729  @Since("2.0.0")
730  def fromML(v: newlinalg.DenseVector): DenseVector = {
731    new DenseVector(v.values)
732  }
733}
734
735/**
736 * A sparse vector represented by an index array and a value array.
737 *
738 * @param size size of the vector.
739 * @param indices index array, assume to be strictly increasing.
740 * @param values value array, must have the same length as the index array.
741 */
742@Since("1.0.0")
743@SQLUserDefinedType(udt = classOf[VectorUDT])
744class SparseVector @Since("1.0.0") (
745    @Since("1.0.0") override val size: Int,
746    @Since("1.0.0") val indices: Array[Int],
747    @Since("1.0.0") val values: Array[Double]) extends Vector {
748
749  require(indices.length == values.length, "Sparse vectors require that the dimension of the" +
750    s" indices match the dimension of the values. You provided ${indices.length} indices and " +
751    s" ${values.length} values.")
752  require(indices.length <= size, s"You provided ${indices.length} indices and values, " +
753    s"which exceeds the specified vector size ${size}.")
754
755  override def toString: String =
756    s"($size,${indices.mkString("[", ",", "]")},${values.mkString("[", ",", "]")})"
757
758  @Since("1.0.0")
759  override def toArray: Array[Double] = {
760    val data = new Array[Double](size)
761    var i = 0
762    val nnz = indices.length
763    while (i < nnz) {
764      data(indices(i)) = values(i)
765      i += 1
766    }
767    data
768  }
769
770  @Since("1.1.0")
771  override def copy: SparseVector = {
772    new SparseVector(size, indices.clone(), values.clone())
773  }
774
775  private[spark] override def asBreeze: BV[Double] = new BSV[Double](indices, values, size)
776
777  @Since("1.6.0")
778  override def foreachActive(f: (Int, Double) => Unit): Unit = {
779    var i = 0
780    val localValuesSize = values.length
781    val localIndices = indices
782    val localValues = values
783
784    while (i < localValuesSize) {
785      f(localIndices(i), localValues(i))
786      i += 1
787    }
788  }
789
790  override def equals(other: Any): Boolean = super.equals(other)
791
792  override def hashCode(): Int = {
793    var result: Int = 31 + size
794    val end = values.length
795    var k = 0
796    var nnz = 0
797    while (k < end && nnz < Vectors.MAX_HASH_NNZ) {
798      val v = values(k)
799      if (v != 0.0) {
800        val i = indices(k)
801        result = 31 * result + i
802        val bits = java.lang.Double.doubleToLongBits(v)
803        result = 31 * result + (bits ^ (bits >>> 32)).toInt
804        nnz += 1
805      }
806      k += 1
807    }
808    result
809  }
810
811  @Since("1.4.0")
812  override def numActives: Int = values.length
813
814  @Since("1.4.0")
815  override def numNonzeros: Int = {
816    var nnz = 0
817    values.foreach { v =>
818      if (v != 0.0) {
819        nnz += 1
820      }
821    }
822    nnz
823  }
824
825  @Since("1.4.0")
826  override def toSparse: SparseVector = {
827    val nnz = numNonzeros
828    if (nnz == numActives) {
829      this
830    } else {
831      val ii = new Array[Int](nnz)
832      val vv = new Array[Double](nnz)
833      var k = 0
834      foreachActive { (i, v) =>
835        if (v != 0.0) {
836          ii(k) = i
837          vv(k) = v
838          k += 1
839        }
840      }
841      new SparseVector(size, ii, vv)
842    }
843  }
844
845  @Since("1.5.0")
846  override def argmax: Int = {
847    if (size == 0) {
848      -1
849    } else {
850      // Find the max active entry.
851      var maxIdx = indices(0)
852      var maxValue = values(0)
853      var maxJ = 0
854      var j = 1
855      val na = numActives
856      while (j < na) {
857        val v = values(j)
858        if (v > maxValue) {
859          maxValue = v
860          maxIdx = indices(j)
861          maxJ = j
862        }
863        j += 1
864      }
865
866      // If the max active entry is nonpositive and there exists inactive ones, find the first zero.
867      if (maxValue <= 0.0 && na < size) {
868        if (maxValue == 0.0) {
869          // If there exists an inactive entry before maxIdx, find it and return its index.
870          if (maxJ < maxIdx) {
871            var k = 0
872            while (k < maxJ && indices(k) == k) {
873              k += 1
874            }
875            maxIdx = k
876          }
877        } else {
878          // If the max active value is negative, find and return the first inactive index.
879          var k = 0
880          while (k < na && indices(k) == k) {
881            k += 1
882          }
883          maxIdx = k
884        }
885      }
886
887      maxIdx
888    }
889  }
890
891  /**
892   * Create a slice of this vector based on the given indices.
893   * @param selectedIndices Unsorted list of indices into the vector.
894   *                        This does NOT do bound checking.
895   * @return  New SparseVector with values in the order specified by the given indices.
896   *
897   * NOTE: The API needs to be discussed before making this public.
898   *       Also, if we have a version assuming indices are sorted, we should optimize it.
899   */
900  private[spark] def slice(selectedIndices: Array[Int]): SparseVector = {
901    var currentIdx = 0
902    val (sliceInds, sliceVals) = selectedIndices.flatMap { origIdx =>
903      val iIdx = java.util.Arrays.binarySearch(this.indices, origIdx)
904      val i_v = if (iIdx >= 0) {
905        Iterator((currentIdx, this.values(iIdx)))
906      } else {
907        Iterator()
908      }
909      currentIdx += 1
910      i_v
911    }.unzip
912    new SparseVector(selectedIndices.length, sliceInds.toArray, sliceVals.toArray)
913  }
914
915  @Since("1.6.0")
916  override def toJson: String = {
917    val jValue = ("type" -> 0) ~
918      ("size" -> size) ~
919      ("indices" -> indices.toSeq) ~
920      ("values" -> values.toSeq)
921    compact(render(jValue))
922  }
923
924  @Since("2.0.0")
925  override def asML: newlinalg.SparseVector = {
926    new newlinalg.SparseVector(size, indices, values)
927  }
928}
929
930@Since("1.3.0")
931object SparseVector {
932  @Since("1.3.0")
933  def unapply(sv: SparseVector): Option[(Int, Array[Int], Array[Double])] =
934    Some((sv.size, sv.indices, sv.values))
935
936  /**
937   * Convert new linalg type to spark.mllib type.  Light copy; only copies references
938   */
939  @Since("2.0.0")
940  def fromML(v: newlinalg.SparseVector): SparseVector = {
941    new SparseVector(v.size, v.indices, v.values)
942  }
943}
944
945/**
946 * Implicit methods available in Scala for converting [[org.apache.spark.mllib.linalg.Vector]] to
947 * [[org.apache.spark.ml.linalg.Vector]] and vice versa.
948 */
949private[spark] object VectorImplicits {
950
951  implicit def mllibVectorToMLVector(v: Vector): newlinalg.Vector = v.asML
952
953  implicit def mllibDenseVectorToMLDenseVector(v: DenseVector): newlinalg.DenseVector = v.asML
954
955  implicit def mllibSparseVectorToMLSparseVector(v: SparseVector): newlinalg.SparseVector = v.asML
956
957  implicit def mlVectorToMLlibVector(v: newlinalg.Vector): Vector = Vectors.fromML(v)
958
959  implicit def mlDenseVectorToMLlibDenseVector(v: newlinalg.DenseVector): DenseVector =
960    Vectors.fromML(v).asInstanceOf[DenseVector]
961
962  implicit def mlSparseVectorToMLlibSparseVector(v: newlinalg.SparseVector): SparseVector =
963    Vectors.fromML(v).asInstanceOf[SparseVector]
964}
965