1/* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18package org.apache.spark.mllib.linalg 19 20import java.lang.{Double => JavaDouble, Integer => JavaInteger, Iterable => JavaIterable} 21import java.util 22 23import scala.annotation.varargs 24import scala.collection.JavaConverters._ 25import scala.language.implicitConversions 26 27import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} 28import org.json4s.DefaultFormats 29import org.json4s.JsonDSL._ 30import org.json4s.jackson.JsonMethods.{compact, parse => parseJson, render} 31 32import org.apache.spark.SparkException 33import org.apache.spark.annotation.{AlphaComponent, Since} 34import org.apache.spark.ml.{linalg => newlinalg} 35import org.apache.spark.mllib.util.NumericParser 36import org.apache.spark.sql.catalyst.InternalRow 37import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData} 38import org.apache.spark.sql.types._ 39 40/** 41 * Represents a numeric vector, whose index type is Int and value type is Double. 42 * 43 * @note Users should not implement this interface. 44 */ 45@SQLUserDefinedType(udt = classOf[VectorUDT]) 46@Since("1.0.0") 47sealed trait Vector extends Serializable { 48 49 /** 50 * Size of the vector. 51 */ 52 @Since("1.0.0") 53 def size: Int 54 55 /** 56 * Converts the instance to a double array. 57 */ 58 @Since("1.0.0") 59 def toArray: Array[Double] 60 61 override def equals(other: Any): Boolean = { 62 other match { 63 case v2: Vector => 64 if (this.size != v2.size) return false 65 (this, v2) match { 66 case (s1: SparseVector, s2: SparseVector) => 67 Vectors.equals(s1.indices, s1.values, s2.indices, s2.values) 68 case (s1: SparseVector, d1: DenseVector) => 69 Vectors.equals(s1.indices, s1.values, 0 until d1.size, d1.values) 70 case (d1: DenseVector, s1: SparseVector) => 71 Vectors.equals(0 until d1.size, d1.values, s1.indices, s1.values) 72 case (_, _) => util.Arrays.equals(this.toArray, v2.toArray) 73 } 74 case _ => false 75 } 76 } 77 78 /** 79 * Returns a hash code value for the vector. The hash code is based on its size and its first 128 80 * nonzero entries, using a hash algorithm similar to `java.util.Arrays.hashCode`. 81 */ 82 override def hashCode(): Int = { 83 // This is a reference implementation. It calls return in foreachActive, which is slow. 84 // Subclasses should override it with optimized implementation. 85 var result: Int = 31 + size 86 var nnz = 0 87 this.foreachActive { (index, value) => 88 if (nnz < Vectors.MAX_HASH_NNZ) { 89 // ignore explicit 0 for comparison between sparse and dense 90 if (value != 0) { 91 result = 31 * result + index 92 val bits = java.lang.Double.doubleToLongBits(value) 93 result = 31 * result + (bits ^ (bits >>> 32)).toInt 94 nnz += 1 95 } 96 } else { 97 return result 98 } 99 } 100 result 101 } 102 103 /** 104 * Converts the instance to a breeze vector. 105 */ 106 private[spark] def asBreeze: BV[Double] 107 108 /** 109 * Gets the value of the ith element. 110 * @param i index 111 */ 112 @Since("1.1.0") 113 def apply(i: Int): Double = asBreeze(i) 114 115 /** 116 * Makes a deep copy of this vector. 117 */ 118 @Since("1.1.0") 119 def copy: Vector = { 120 throw new NotImplementedError(s"copy is not implemented for ${this.getClass}.") 121 } 122 123 /** 124 * Applies a function `f` to all the active elements of dense and sparse vector. 125 * 126 * @param f the function takes two parameters where the first parameter is the index of 127 * the vector with type `Int`, and the second parameter is the corresponding value 128 * with type `Double`. 129 */ 130 @Since("1.6.0") 131 def foreachActive(f: (Int, Double) => Unit): Unit 132 133 /** 134 * Number of active entries. An "active entry" is an element which is explicitly stored, 135 * regardless of its value. 136 * 137 * @note Inactive entries have value 0. 138 */ 139 @Since("1.4.0") 140 def numActives: Int 141 142 /** 143 * Number of nonzero elements. This scans all active values and count nonzeros. 144 */ 145 @Since("1.4.0") 146 def numNonzeros: Int 147 148 /** 149 * Converts this vector to a sparse vector with all explicit zeros removed. 150 */ 151 @Since("1.4.0") 152 def toSparse: SparseVector 153 154 /** 155 * Converts this vector to a dense vector. 156 */ 157 @Since("1.4.0") 158 def toDense: DenseVector = new DenseVector(this.toArray) 159 160 /** 161 * Returns a vector in either dense or sparse format, whichever uses less storage. 162 */ 163 @Since("1.4.0") 164 def compressed: Vector = { 165 val nnz = numNonzeros 166 // A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes. 167 if (1.5 * (nnz + 1.0) < size) { 168 toSparse 169 } else { 170 toDense 171 } 172 } 173 174 /** 175 * Find the index of a maximal element. Returns the first maximal element in case of a tie. 176 * Returns -1 if vector has length 0. 177 */ 178 @Since("1.5.0") 179 def argmax: Int 180 181 /** 182 * Converts the vector to a JSON string. 183 */ 184 @Since("1.6.0") 185 def toJson: String 186 187 /** 188 * Convert this vector to the new mllib-local representation. 189 * This does NOT copy the data; it copies references. 190 */ 191 @Since("2.0.0") 192 def asML: newlinalg.Vector 193} 194 195/** 196 * :: AlphaComponent :: 197 * 198 * User-defined type for [[Vector]] which allows easy interaction with SQL 199 * via [[org.apache.spark.sql.Dataset]]. 200 */ 201@AlphaComponent 202class VectorUDT extends UserDefinedType[Vector] { 203 204 override def sqlType: StructType = { 205 // type: 0 = sparse, 1 = dense 206 // We only use "values" for dense vectors, and "size", "indices", and "values" for sparse 207 // vectors. The "values" field is nullable because we might want to add binary vectors later, 208 // which uses "size" and "indices", but not "values". 209 StructType(Seq( 210 StructField("type", ByteType, nullable = false), 211 StructField("size", IntegerType, nullable = true), 212 StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), 213 StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true))) 214 } 215 216 override def serialize(obj: Vector): InternalRow = { 217 obj match { 218 case SparseVector(size, indices, values) => 219 val row = new GenericInternalRow(4) 220 row.setByte(0, 0) 221 row.setInt(1, size) 222 row.update(2, UnsafeArrayData.fromPrimitiveArray(indices)) 223 row.update(3, UnsafeArrayData.fromPrimitiveArray(values)) 224 row 225 case DenseVector(values) => 226 val row = new GenericInternalRow(4) 227 row.setByte(0, 1) 228 row.setNullAt(1) 229 row.setNullAt(2) 230 row.update(3, UnsafeArrayData.fromPrimitiveArray(values)) 231 row 232 } 233 } 234 235 override def deserialize(datum: Any): Vector = { 236 datum match { 237 case row: InternalRow => 238 require(row.numFields == 4, 239 s"VectorUDT.deserialize given row with length ${row.numFields} but requires length == 4") 240 val tpe = row.getByte(0) 241 tpe match { 242 case 0 => 243 val size = row.getInt(1) 244 val indices = row.getArray(2).toIntArray() 245 val values = row.getArray(3).toDoubleArray() 246 new SparseVector(size, indices, values) 247 case 1 => 248 val values = row.getArray(3).toDoubleArray() 249 new DenseVector(values) 250 } 251 } 252 } 253 254 override def pyUDT: String = "pyspark.mllib.linalg.VectorUDT" 255 256 override def userClass: Class[Vector] = classOf[Vector] 257 258 override def equals(o: Any): Boolean = { 259 o match { 260 case v: VectorUDT => true 261 case _ => false 262 } 263 } 264 265 // see [SPARK-8647], this achieves the needed constant hash code without constant no. 266 override def hashCode(): Int = classOf[VectorUDT].getName.hashCode() 267 268 override def typeName: String = "vector" 269 270 private[spark] override def asNullable: VectorUDT = this 271} 272 273/** 274 * Factory methods for [[org.apache.spark.mllib.linalg.Vector]]. 275 * We don't use the name `Vector` because Scala imports 276 * [[scala.collection.immutable.Vector]] by default. 277 */ 278@Since("1.0.0") 279object Vectors { 280 281 /** 282 * Creates a dense vector from its values. 283 */ 284 @Since("1.0.0") 285 @varargs 286 def dense(firstValue: Double, otherValues: Double*): Vector = 287 new DenseVector((firstValue +: otherValues).toArray) 288 289 // A dummy implicit is used to avoid signature collision with the one generated by @varargs. 290 /** 291 * Creates a dense vector from a double array. 292 */ 293 @Since("1.0.0") 294 def dense(values: Array[Double]): Vector = new DenseVector(values) 295 296 /** 297 * Creates a sparse vector providing its index array and value array. 298 * 299 * @param size vector size. 300 * @param indices index array, must be strictly increasing. 301 * @param values value array, must have the same length as indices. 302 */ 303 @Since("1.0.0") 304 def sparse(size: Int, indices: Array[Int], values: Array[Double]): Vector = 305 new SparseVector(size, indices, values) 306 307 /** 308 * Creates a sparse vector using unordered (index, value) pairs. 309 * 310 * @param size vector size. 311 * @param elements vector elements in (index, value) pairs. 312 */ 313 @Since("1.0.0") 314 def sparse(size: Int, elements: Seq[(Int, Double)]): Vector = { 315 require(size > 0, "The size of the requested sparse vector must be greater than 0.") 316 317 val (indices, values) = elements.sortBy(_._1).unzip 318 var prev = -1 319 indices.foreach { i => 320 require(prev < i, s"Found duplicate indices: $i.") 321 prev = i 322 } 323 require(prev < size, s"You may not write an element to index $prev because the declared " + 324 s"size of your vector is $size") 325 326 new SparseVector(size, indices.toArray, values.toArray) 327 } 328 329 /** 330 * Creates a sparse vector using unordered (index, value) pairs in a Java friendly way. 331 * 332 * @param size vector size. 333 * @param elements vector elements in (index, value) pairs. 334 */ 335 @Since("1.0.0") 336 def sparse(size: Int, elements: JavaIterable[(JavaInteger, JavaDouble)]): Vector = { 337 sparse(size, elements.asScala.map { case (i, x) => 338 (i.intValue(), x.doubleValue()) 339 }.toSeq) 340 } 341 342 /** 343 * Creates a vector of all zeros. 344 * 345 * @param size vector size 346 * @return a zero vector 347 */ 348 @Since("1.1.0") 349 def zeros(size: Int): Vector = { 350 new DenseVector(new Array[Double](size)) 351 } 352 353 /** 354 * Parses a string resulted from `Vector.toString` into a [[Vector]]. 355 */ 356 @Since("1.1.0") 357 def parse(s: String): Vector = { 358 parseNumeric(NumericParser.parse(s)) 359 } 360 361 /** 362 * Parses the JSON representation of a vector into a [[Vector]]. 363 */ 364 @Since("1.6.0") 365 def fromJson(json: String): Vector = { 366 implicit val formats = DefaultFormats 367 val jValue = parseJson(json) 368 (jValue \ "type").extract[Int] match { 369 case 0 => // sparse 370 val size = (jValue \ "size").extract[Int] 371 val indices = (jValue \ "indices").extract[Seq[Int]].toArray 372 val values = (jValue \ "values").extract[Seq[Double]].toArray 373 sparse(size, indices, values) 374 case 1 => // dense 375 val values = (jValue \ "values").extract[Seq[Double]].toArray 376 dense(values) 377 case _ => 378 throw new IllegalArgumentException(s"Cannot parse $json into a vector.") 379 } 380 } 381 382 private[mllib] def parseNumeric(any: Any): Vector = { 383 any match { 384 case values: Array[Double] => 385 Vectors.dense(values) 386 case Seq(size: Double, indices: Array[Double], values: Array[Double]) => 387 Vectors.sparse(size.toInt, indices.map(_.toInt), values) 388 case other => 389 throw new SparkException(s"Cannot parse $other.") 390 } 391 } 392 393 /** 394 * Creates a vector instance from a breeze vector. 395 */ 396 private[spark] def fromBreeze(breezeVector: BV[Double]): Vector = { 397 breezeVector match { 398 case v: BDV[Double] => 399 if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) { 400 new DenseVector(v.data) 401 } else { 402 new DenseVector(v.toArray) // Can't use underlying array directly, so make a new one 403 } 404 case v: BSV[Double] => 405 if (v.index.length == v.used) { 406 new SparseVector(v.length, v.index, v.data) 407 } else { 408 new SparseVector(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) 409 } 410 case v: BV[_] => 411 sys.error("Unsupported Breeze vector type: " + v.getClass.getName) 412 } 413 } 414 415 /** 416 * Returns the p-norm of this vector. 417 * @param vector input vector. 418 * @param p norm. 419 * @return norm in L^p^ space. 420 */ 421 @Since("1.3.0") 422 def norm(vector: Vector, p: Double): Double = { 423 require(p >= 1.0, "To compute the p-norm of the vector, we require that you specify a p>=1. " + 424 s"You specified p=$p.") 425 val values = vector match { 426 case DenseVector(vs) => vs 427 case SparseVector(n, ids, vs) => vs 428 case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) 429 } 430 val size = values.length 431 432 if (p == 1) { 433 var sum = 0.0 434 var i = 0 435 while (i < size) { 436 sum += math.abs(values(i)) 437 i += 1 438 } 439 sum 440 } else if (p == 2) { 441 var sum = 0.0 442 var i = 0 443 while (i < size) { 444 sum += values(i) * values(i) 445 i += 1 446 } 447 math.sqrt(sum) 448 } else if (p == Double.PositiveInfinity) { 449 var max = 0.0 450 var i = 0 451 while (i < size) { 452 val value = math.abs(values(i)) 453 if (value > max) max = value 454 i += 1 455 } 456 max 457 } else { 458 var sum = 0.0 459 var i = 0 460 while (i < size) { 461 sum += math.pow(math.abs(values(i)), p) 462 i += 1 463 } 464 math.pow(sum, 1.0 / p) 465 } 466 } 467 468 /** 469 * Returns the squared distance between two Vectors. 470 * @param v1 first Vector. 471 * @param v2 second Vector. 472 * @return squared distance between two Vectors. 473 */ 474 @Since("1.3.0") 475 def sqdist(v1: Vector, v2: Vector): Double = { 476 require(v1.size == v2.size, s"Vector dimensions do not match: Dim(v1)=${v1.size} and Dim(v2)" + 477 s"=${v2.size}.") 478 var squaredDistance = 0.0 479 (v1, v2) match { 480 case (v1: SparseVector, v2: SparseVector) => 481 val v1Values = v1.values 482 val v1Indices = v1.indices 483 val v2Values = v2.values 484 val v2Indices = v2.indices 485 val nnzv1 = v1Indices.length 486 val nnzv2 = v2Indices.length 487 488 var kv1 = 0 489 var kv2 = 0 490 while (kv1 < nnzv1 || kv2 < nnzv2) { 491 var score = 0.0 492 493 if (kv2 >= nnzv2 || (kv1 < nnzv1 && v1Indices(kv1) < v2Indices(kv2))) { 494 score = v1Values(kv1) 495 kv1 += 1 496 } else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) { 497 score = v2Values(kv2) 498 kv2 += 1 499 } else { 500 score = v1Values(kv1) - v2Values(kv2) 501 kv1 += 1 502 kv2 += 1 503 } 504 squaredDistance += score * score 505 } 506 507 case (v1: SparseVector, v2: DenseVector) => 508 squaredDistance = sqdist(v1, v2) 509 510 case (v1: DenseVector, v2: SparseVector) => 511 squaredDistance = sqdist(v2, v1) 512 513 case (DenseVector(vv1), DenseVector(vv2)) => 514 var kv = 0 515 val sz = vv1.length 516 while (kv < sz) { 517 val score = vv1(kv) - vv2(kv) 518 squaredDistance += score * score 519 kv += 1 520 } 521 case _ => 522 throw new IllegalArgumentException("Do not support vector type " + v1.getClass + 523 " and " + v2.getClass) 524 } 525 squaredDistance 526 } 527 528 /** 529 * Returns the squared distance between DenseVector and SparseVector. 530 */ 531 private[mllib] def sqdist(v1: SparseVector, v2: DenseVector): Double = { 532 var kv1 = 0 533 var kv2 = 0 534 val indices = v1.indices 535 var squaredDistance = 0.0 536 val nnzv1 = indices.length 537 val nnzv2 = v2.size 538 var iv1 = if (nnzv1 > 0) indices(kv1) else -1 539 540 while (kv2 < nnzv2) { 541 var score = 0.0 542 if (kv2 != iv1) { 543 score = v2(kv2) 544 } else { 545 score = v1.values(kv1) - v2(kv2) 546 if (kv1 < nnzv1 - 1) { 547 kv1 += 1 548 iv1 = indices(kv1) 549 } 550 } 551 squaredDistance += score * score 552 kv2 += 1 553 } 554 squaredDistance 555 } 556 557 /** 558 * Check equality between sparse/dense vectors 559 */ 560 private[mllib] def equals( 561 v1Indices: IndexedSeq[Int], 562 v1Values: Array[Double], 563 v2Indices: IndexedSeq[Int], 564 v2Values: Array[Double]): Boolean = { 565 val v1Size = v1Values.length 566 val v2Size = v2Values.length 567 var k1 = 0 568 var k2 = 0 569 var allEqual = true 570 while (allEqual) { 571 while (k1 < v1Size && v1Values(k1) == 0) k1 += 1 572 while (k2 < v2Size && v2Values(k2) == 0) k2 += 1 573 574 if (k1 >= v1Size || k2 >= v2Size) { 575 return k1 >= v1Size && k2 >= v2Size // check end alignment 576 } 577 allEqual = v1Indices(k1) == v2Indices(k2) && v1Values(k1) == v2Values(k2) 578 k1 += 1 579 k2 += 1 580 } 581 allEqual 582 } 583 584 /** Max number of nonzero entries used in computing hash code. */ 585 private[linalg] val MAX_HASH_NNZ = 128 586 587 /** 588 * Convert new linalg type to spark.mllib type. Light copy; only copies references 589 */ 590 @Since("2.0.0") 591 def fromML(v: newlinalg.Vector): Vector = v match { 592 case dv: newlinalg.DenseVector => 593 DenseVector.fromML(dv) 594 case sv: newlinalg.SparseVector => 595 SparseVector.fromML(sv) 596 } 597} 598 599/** 600 * A dense vector represented by a value array. 601 */ 602@Since("1.0.0") 603@SQLUserDefinedType(udt = classOf[VectorUDT]) 604class DenseVector @Since("1.0.0") ( 605 @Since("1.0.0") val values: Array[Double]) extends Vector { 606 607 @Since("1.0.0") 608 override def size: Int = values.length 609 610 override def toString: String = values.mkString("[", ",", "]") 611 612 @Since("1.0.0") 613 override def toArray: Array[Double] = values 614 615 private[spark] override def asBreeze: BV[Double] = new BDV[Double](values) 616 617 @Since("1.0.0") 618 override def apply(i: Int): Double = values(i) 619 620 @Since("1.1.0") 621 override def copy: DenseVector = { 622 new DenseVector(values.clone()) 623 } 624 625 @Since("1.6.0") 626 override def foreachActive(f: (Int, Double) => Unit): Unit = { 627 var i = 0 628 val localValuesSize = values.length 629 val localValues = values 630 631 while (i < localValuesSize) { 632 f(i, localValues(i)) 633 i += 1 634 } 635 } 636 637 override def equals(other: Any): Boolean = super.equals(other) 638 639 override def hashCode(): Int = { 640 var result: Int = 31 + size 641 var i = 0 642 val end = values.length 643 var nnz = 0 644 while (i < end && nnz < Vectors.MAX_HASH_NNZ) { 645 val v = values(i) 646 if (v != 0.0) { 647 result = 31 * result + i 648 val bits = java.lang.Double.doubleToLongBits(values(i)) 649 result = 31 * result + (bits ^ (bits >>> 32)).toInt 650 nnz += 1 651 } 652 i += 1 653 } 654 result 655 } 656 657 @Since("1.4.0") 658 override def numActives: Int = size 659 660 @Since("1.4.0") 661 override def numNonzeros: Int = { 662 // same as values.count(_ != 0.0) but faster 663 var nnz = 0 664 values.foreach { v => 665 if (v != 0.0) { 666 nnz += 1 667 } 668 } 669 nnz 670 } 671 672 @Since("1.4.0") 673 override def toSparse: SparseVector = { 674 val nnz = numNonzeros 675 val ii = new Array[Int](nnz) 676 val vv = new Array[Double](nnz) 677 var k = 0 678 foreachActive { (i, v) => 679 if (v != 0) { 680 ii(k) = i 681 vv(k) = v 682 k += 1 683 } 684 } 685 new SparseVector(size, ii, vv) 686 } 687 688 @Since("1.5.0") 689 override def argmax: Int = { 690 if (size == 0) { 691 -1 692 } else { 693 var maxIdx = 0 694 var maxValue = values(0) 695 var i = 1 696 while (i < size) { 697 if (values(i) > maxValue) { 698 maxIdx = i 699 maxValue = values(i) 700 } 701 i += 1 702 } 703 maxIdx 704 } 705 } 706 707 @Since("1.6.0") 708 override def toJson: String = { 709 val jValue = ("type" -> 1) ~ ("values" -> values.toSeq) 710 compact(render(jValue)) 711 } 712 713 @Since("2.0.0") 714 override def asML: newlinalg.DenseVector = { 715 new newlinalg.DenseVector(values) 716 } 717} 718 719@Since("1.3.0") 720object DenseVector { 721 722 /** Extracts the value array from a dense vector. */ 723 @Since("1.3.0") 724 def unapply(dv: DenseVector): Option[Array[Double]] = Some(dv.values) 725 726 /** 727 * Convert new linalg type to spark.mllib type. Light copy; only copies references 728 */ 729 @Since("2.0.0") 730 def fromML(v: newlinalg.DenseVector): DenseVector = { 731 new DenseVector(v.values) 732 } 733} 734 735/** 736 * A sparse vector represented by an index array and a value array. 737 * 738 * @param size size of the vector. 739 * @param indices index array, assume to be strictly increasing. 740 * @param values value array, must have the same length as the index array. 741 */ 742@Since("1.0.0") 743@SQLUserDefinedType(udt = classOf[VectorUDT]) 744class SparseVector @Since("1.0.0") ( 745 @Since("1.0.0") override val size: Int, 746 @Since("1.0.0") val indices: Array[Int], 747 @Since("1.0.0") val values: Array[Double]) extends Vector { 748 749 require(indices.length == values.length, "Sparse vectors require that the dimension of the" + 750 s" indices match the dimension of the values. You provided ${indices.length} indices and " + 751 s" ${values.length} values.") 752 require(indices.length <= size, s"You provided ${indices.length} indices and values, " + 753 s"which exceeds the specified vector size ${size}.") 754 755 override def toString: String = 756 s"($size,${indices.mkString("[", ",", "]")},${values.mkString("[", ",", "]")})" 757 758 @Since("1.0.0") 759 override def toArray: Array[Double] = { 760 val data = new Array[Double](size) 761 var i = 0 762 val nnz = indices.length 763 while (i < nnz) { 764 data(indices(i)) = values(i) 765 i += 1 766 } 767 data 768 } 769 770 @Since("1.1.0") 771 override def copy: SparseVector = { 772 new SparseVector(size, indices.clone(), values.clone()) 773 } 774 775 private[spark] override def asBreeze: BV[Double] = new BSV[Double](indices, values, size) 776 777 @Since("1.6.0") 778 override def foreachActive(f: (Int, Double) => Unit): Unit = { 779 var i = 0 780 val localValuesSize = values.length 781 val localIndices = indices 782 val localValues = values 783 784 while (i < localValuesSize) { 785 f(localIndices(i), localValues(i)) 786 i += 1 787 } 788 } 789 790 override def equals(other: Any): Boolean = super.equals(other) 791 792 override def hashCode(): Int = { 793 var result: Int = 31 + size 794 val end = values.length 795 var k = 0 796 var nnz = 0 797 while (k < end && nnz < Vectors.MAX_HASH_NNZ) { 798 val v = values(k) 799 if (v != 0.0) { 800 val i = indices(k) 801 result = 31 * result + i 802 val bits = java.lang.Double.doubleToLongBits(v) 803 result = 31 * result + (bits ^ (bits >>> 32)).toInt 804 nnz += 1 805 } 806 k += 1 807 } 808 result 809 } 810 811 @Since("1.4.0") 812 override def numActives: Int = values.length 813 814 @Since("1.4.0") 815 override def numNonzeros: Int = { 816 var nnz = 0 817 values.foreach { v => 818 if (v != 0.0) { 819 nnz += 1 820 } 821 } 822 nnz 823 } 824 825 @Since("1.4.0") 826 override def toSparse: SparseVector = { 827 val nnz = numNonzeros 828 if (nnz == numActives) { 829 this 830 } else { 831 val ii = new Array[Int](nnz) 832 val vv = new Array[Double](nnz) 833 var k = 0 834 foreachActive { (i, v) => 835 if (v != 0.0) { 836 ii(k) = i 837 vv(k) = v 838 k += 1 839 } 840 } 841 new SparseVector(size, ii, vv) 842 } 843 } 844 845 @Since("1.5.0") 846 override def argmax: Int = { 847 if (size == 0) { 848 -1 849 } else { 850 // Find the max active entry. 851 var maxIdx = indices(0) 852 var maxValue = values(0) 853 var maxJ = 0 854 var j = 1 855 val na = numActives 856 while (j < na) { 857 val v = values(j) 858 if (v > maxValue) { 859 maxValue = v 860 maxIdx = indices(j) 861 maxJ = j 862 } 863 j += 1 864 } 865 866 // If the max active entry is nonpositive and there exists inactive ones, find the first zero. 867 if (maxValue <= 0.0 && na < size) { 868 if (maxValue == 0.0) { 869 // If there exists an inactive entry before maxIdx, find it and return its index. 870 if (maxJ < maxIdx) { 871 var k = 0 872 while (k < maxJ && indices(k) == k) { 873 k += 1 874 } 875 maxIdx = k 876 } 877 } else { 878 // If the max active value is negative, find and return the first inactive index. 879 var k = 0 880 while (k < na && indices(k) == k) { 881 k += 1 882 } 883 maxIdx = k 884 } 885 } 886 887 maxIdx 888 } 889 } 890 891 /** 892 * Create a slice of this vector based on the given indices. 893 * @param selectedIndices Unsorted list of indices into the vector. 894 * This does NOT do bound checking. 895 * @return New SparseVector with values in the order specified by the given indices. 896 * 897 * NOTE: The API needs to be discussed before making this public. 898 * Also, if we have a version assuming indices are sorted, we should optimize it. 899 */ 900 private[spark] def slice(selectedIndices: Array[Int]): SparseVector = { 901 var currentIdx = 0 902 val (sliceInds, sliceVals) = selectedIndices.flatMap { origIdx => 903 val iIdx = java.util.Arrays.binarySearch(this.indices, origIdx) 904 val i_v = if (iIdx >= 0) { 905 Iterator((currentIdx, this.values(iIdx))) 906 } else { 907 Iterator() 908 } 909 currentIdx += 1 910 i_v 911 }.unzip 912 new SparseVector(selectedIndices.length, sliceInds.toArray, sliceVals.toArray) 913 } 914 915 @Since("1.6.0") 916 override def toJson: String = { 917 val jValue = ("type" -> 0) ~ 918 ("size" -> size) ~ 919 ("indices" -> indices.toSeq) ~ 920 ("values" -> values.toSeq) 921 compact(render(jValue)) 922 } 923 924 @Since("2.0.0") 925 override def asML: newlinalg.SparseVector = { 926 new newlinalg.SparseVector(size, indices, values) 927 } 928} 929 930@Since("1.3.0") 931object SparseVector { 932 @Since("1.3.0") 933 def unapply(sv: SparseVector): Option[(Int, Array[Int], Array[Double])] = 934 Some((sv.size, sv.indices, sv.values)) 935 936 /** 937 * Convert new linalg type to spark.mllib type. Light copy; only copies references 938 */ 939 @Since("2.0.0") 940 def fromML(v: newlinalg.SparseVector): SparseVector = { 941 new SparseVector(v.size, v.indices, v.values) 942 } 943} 944 945/** 946 * Implicit methods available in Scala for converting [[org.apache.spark.mllib.linalg.Vector]] to 947 * [[org.apache.spark.ml.linalg.Vector]] and vice versa. 948 */ 949private[spark] object VectorImplicits { 950 951 implicit def mllibVectorToMLVector(v: Vector): newlinalg.Vector = v.asML 952 953 implicit def mllibDenseVectorToMLDenseVector(v: DenseVector): newlinalg.DenseVector = v.asML 954 955 implicit def mllibSparseVectorToMLSparseVector(v: SparseVector): newlinalg.SparseVector = v.asML 956 957 implicit def mlVectorToMLlibVector(v: newlinalg.Vector): Vector = Vectors.fromML(v) 958 959 implicit def mlDenseVectorToMLlibDenseVector(v: newlinalg.DenseVector): DenseVector = 960 Vectors.fromML(v).asInstanceOf[DenseVector] 961 962 implicit def mlSparseVectorToMLlibSparseVector(v: newlinalg.SparseVector): SparseVector = 963 Vectors.fromML(v).asInstanceOf[SparseVector] 964} 965