[SPARK-4797] Replace breezeSquaredDistance

This PR replaces slow breezeSquaredDistance.

Author: Liang-Chi Hsieh <viirya@gmail.com>

Closes #3643 from viirya/faster_squareddistance and squashes the following commits:

f28b275 [Liang-Chi Hsieh] Move the implementation to linalg.Vectors and rename as sqdist.
0bc48ee [Liang-Chi Hsieh] Merge branch 'master' into faster_squareddistance
ba34422 [Liang-Chi Hsieh] Fix bug.
91849d0 [Liang-Chi Hsieh] Modified for comment.
44a65ad [Liang-Chi Hsieh] Modified for comments.
35db395 [Liang-Chi Hsieh] Fix bug and some modifications for comments.
f4f5ebb [Liang-Chi Hsieh] Follow BLAS.dot pattern to replace intersect, diff with while-loop.
a36e09f [Liang-Chi Hsieh] Use while-loop to replace foreach for better performance.
d3e0628 [Liang-Chi Hsieh] Make the methods private.
dd415bc [Liang-Chi Hsieh] Consider different cases of SparseVector and DenseVector.
13669db [Liang-Chi Hsieh] Replace breezeSquaredDistance.
This commit is contained in:
Liang-Chi Hsieh 2014-12-31 11:50:53 -08:00 committed by Xiangrui Meng
parent 352ed6bbe3
commit 06a9aa589c
3 changed files with 100 additions and 8 deletions

View file

@ -312,6 +312,86 @@ object Vectors {
math.pow(sum, 1.0 / p)
}
}
/**
* Returns the squared distance between two Vectors.
* @param v1 first Vector.
* @param v2 second Vector.
* @return squared distance between two Vectors.
*/
def sqdist(v1: Vector, v2: Vector): Double = {
var squaredDistance = 0.0
(v1, v2) match {
case (v1: SparseVector, v2: SparseVector) =>
val v1Values = v1.values
val v1Indices = v1.indices
val v2Values = v2.values
val v2Indices = v2.indices
val nnzv1 = v1Indices.size
val nnzv2 = v2Indices.size
var kv1 = 0
var kv2 = 0
while (kv1 < nnzv1 || kv2 < nnzv2) {
var score = 0.0
if (kv2 >= nnzv2 || (kv1 < nnzv1 && v1Indices(kv1) < v2Indices(kv2))) {
score = v1Values(kv1)
kv1 += 1
} else if (kv1 >= nnzv1 || (kv2 < nnzv2 && v2Indices(kv2) < v1Indices(kv1))) {
score = v2Values(kv2)
kv2 += 1
} else {
score = v1Values(kv1) - v2Values(kv2)
kv1 += 1
kv2 += 1
}
squaredDistance += score * score
}
case (v1: SparseVector, v2: DenseVector) if v1.indices.length / v1.size < 0.5 =>
squaredDistance = sqdist(v1, v2)
case (v1: DenseVector, v2: SparseVector) if v2.indices.length / v2.size < 0.5 =>
squaredDistance = sqdist(v2, v1)
// When a SparseVector is approximately dense, we treat it as a DenseVector
case (v1, v2) =>
squaredDistance = v1.toArray.zip(v2.toArray).foldLeft(0.0){ (distance, elems) =>
val score = elems._1 - elems._2
distance + score * score
}
}
squaredDistance
}
/**
* Returns the squared distance between DenseVector and SparseVector.
*/
private[mllib] def sqdist(v1: SparseVector, v2: DenseVector): Double = {
var kv1 = 0
var kv2 = 0
val indices = v1.indices
var squaredDistance = 0.0
var iv1 = indices(kv1)
val nnzv2 = v2.size
while (kv2 < nnzv2) {
var score = 0.0
if (kv2 != iv1) {
score = v2(kv2)
} else {
score = v1.values(kv1) - v2(kv2)
if (kv1 < indices.length - 1) {
kv1 += 1
iv1 = indices(kv1)
}
}
squaredDistance += score * score
kv2 += 1
}
squaredDistance
}
}
/**

View file

@ -19,8 +19,7 @@ package org.apache.spark.mllib.util
import scala.reflect.ClassTag
import breeze.linalg.{DenseVector => BDV, SparseVector => BSV,
squaredDistance => breezeSquaredDistance}
import breeze.linalg.{DenseVector => BDV, SparseVector => BSV}
import org.apache.spark.annotation.Experimental
import org.apache.spark.SparkContext
@ -28,7 +27,7 @@ import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.PartitionwiseSampledRDD
import org.apache.spark.util.random.BernoulliCellSampler
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.mllib.linalg.{SparseVector, DenseVector, Vector, Vectors}
import org.apache.spark.mllib.linalg.BLAS.dot
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
@ -316,12 +315,10 @@ object MLUtils {
val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) /
(sqDist + EPSILON)
if (precisionBound2 > precision) {
// TODO: breezeSquaredDistance is slow,
// so we should replace it with our own implementation.
sqDist = breezeSquaredDistance(v1.toBreeze, v2.toBreeze)
sqDist = Vectors.sqdist(v1, v2)
}
} else {
sqDist = breezeSquaredDistance(v1.toBreeze, v2.toBreeze)
sqDist = Vectors.sqdist(v1, v2)
}
sqDist
}

View file

@ -52,12 +52,27 @@ class MLUtilsSuite extends FunSuite with MLlibTestSparkContext {
val values = indices.map(i => a(i))
val v2 = Vectors.sparse(n, indices, values)
val norm2 = Vectors.norm(v2, 2.0)
val v3 = Vectors.sparse(n, indices, indices.map(i => a(i) + 0.5))
val norm3 = Vectors.norm(v3, 2.0)
val squaredDist = breezeSquaredDistance(v1.toBreeze, v2.toBreeze)
val fastSquaredDist1 = fastSquaredDistance(v1, norm1, v2, norm2, precision)
assert((fastSquaredDist1 - squaredDist) <= precision * squaredDist, s"failed with m = $m")
val fastSquaredDist2 =
fastSquaredDistance(v1, norm1, Vectors.dense(v2.toArray), norm2, precision)
assert((fastSquaredDist2 - squaredDist) <= precision * squaredDist, s"failed with m = $m")
val squaredDist2 = breezeSquaredDistance(v2.toBreeze, v3.toBreeze)
val fastSquaredDist3 =
fastSquaredDistance(v2, norm2, v3, norm3, precision)
assert((fastSquaredDist3 - squaredDist2) <= precision * squaredDist2, s"failed with m = $m")
if (m > 10) {
val v4 = Vectors.sparse(n, indices.slice(0, m - 10),
indices.map(i => a(i) + 0.5).slice(0, m - 10))
val norm4 = Vectors.norm(v4, 2.0)
val squaredDist = breezeSquaredDistance(v2.toBreeze, v4.toBreeze)
val fastSquaredDist =
fastSquaredDistance(v2, norm2, v4, norm4, precision)
assert((fastSquaredDist - squaredDist) <= precision * squaredDist, s"failed with m = $m")
}
}
}