From d85cd4eb1479f8d37dab360530dc2c71216b4a8d Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Sun, 1 Feb 2015 19:40:26 -0800 Subject: [PATCH] [Spark-5406][MLlib] LocalLAPACK mode in RowMatrix.computeSVD should have much smaller upper bound JIRA link: https://issues.apache.org/jira/browse/SPARK-5406 The code in breeze svd imposes the upper bound for LocalLAPACK in RowMatrix.computeSVD code from breeze svd (https://github.com/scalanlp/breeze/blob/master/math/src/main/scala/breeze/linalg/functions/svd.scala) val workSize = ( 3 * scala.math.min(m, n) * scala.math.min(m, n) + scala.math.max(scala.math.max(m, n), 4 * scala.math.min(m, n) * scala.math.min(m, n) + 4 * scala.math.min(m, n)) ) val work = new Array[Double](workSize) As a result, 7 * n * n + 4 * n < Int.MaxValue at least (depends on JVM) In some worse cases, like n = 25000, work size will become positive again (80032704) and bring wired behavior. The PR is only the beginning, to support Genbase ( an important biological benchmark that would help promote Spark to genetic applications, http://www.paradigm4.com/wp-content/uploads/2014/06/Genomics-Benchmark-Technical-Report.pdf), which needs to compute svd for matrix up to 60K * 70K. I found many potential issues and would like to know if there's any plan undergoing that would expand the range of matrix computation based on Spark. Thanks. Author: Yuhao Yang Closes #4200 from hhbyyh/rowMatrix and squashes the following commits: f7864d0 [Yuhao Yang] update auto logic for rowMatrix svd 23860e4 [Yuhao Yang] fix comment style e48a6e4 [Yuhao Yang] make latent svd computation constraint clear --- .../apache/spark/mllib/linalg/distributed/RowMatrix.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 53b7970470..961111507f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -219,8 +219,12 @@ class RowMatrix( val computeMode = mode match { case "auto" => + if(k > 5000) { + logWarning(s"computing svd with k=$k and n=$n, please check necessity") + } + // TODO: The conditions below are not fully tested. - if (n < 100 || k > n / 2) { + if (n < 100 || (k > n / 2 && n <= 15000)) { // If n is small or k is large compared with n, we better compute the Gramian matrix first // and then compute its eigenvalues locally, instead of making multiple passes. if (k < n / 3) { @@ -245,6 +249,8 @@ class RowMatrix( val G = computeGramianMatrix().toBreeze.asInstanceOf[BDM[Double]] EigenValueDecomposition.symmetricEigs(v => G * v, n, k, tol, maxIter) case SVDMode.LocalLAPACK => + // breeze (v0.10) svd latent constraint, 7 * n * n + 4 * n < Int.MaxValue + require(n < 17515, s"$n exceeds the breeze svd capability") val G = computeGramianMatrix().toBreeze.asInstanceOf[BDM[Double]] val brzSvd.SVD(uFull: BDM[Double], sigmaSquaresFull: BDV[Double], _) = brzSvd(G) (sigmaSquaresFull, uFull)