From c09675779b5669119ed225bd4eb69c9c12c9be51 Mon Sep 17 00:00:00 2001
From: Sean Owen
Date: Sun, 4 Aug 2019 17:04:01 -0500
Subject: [PATCH] [SPARK-28604][ML] Use log1p(x) over log(1+x) and expm1(x)
 over exp(x)-1 for accuracy

## What changes were proposed in this pull request?

Use `log1p(x)` over `log(1+x)` and `expm1(x)` over `exp(x)-1` for accuracy, where possible. This should improve accuracy a tiny bit in ML-related calculations, and shouldn't hurt in any event.

## How was this patch tested?

Existing tests.

Closes #25337 from srowen/SPARK-28604.

Authored-by: Sean Owen
Signed-off-by: Sean Owen
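For illustration, a minimal self-contained sketch of the rounding behavior these substitutions avoid (not code from this patch; `Log1pDemo` is just a placeholder name):

```scala
// Near zero, 1.0 + x and 1.0 - x round to exactly 1.0 in double precision,
// so the naive forms lose all significance; log1p/expm1 keep full precision.
object Log1pDemo extends App {
  val x = 1e-18
  println(math.log(1.0 + x))  // 0.0       (1 + x already rounded to 1)
  println(math.log1p(x))      // ~1.0e-18  (correct)
  println(math.log(1.0 - x))  // 0.0       (1 - x already rounded to 1)
  println(math.log1p(-x))     // ~-1.0e-18 (correct)
  println(math.exp(x) - 1.0)  // 0.0       (exp(x) rounded to 1)
  println(math.expm1(x))      // ~1.0e-18  (correct)
}
```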
---
 .../java/org/apache/spark/util/sketch/CountMinSketchImpl.java | 2 +-
 .../scala/org/apache/spark/ml/classification/NaiveBayes.scala | 4 ++--
 .../apache/spark/ml/regression/AFTSurvivalRegression.scala    | 2 +-
 .../spark/ml/regression/GeneralizedLinearRegression.scala     | 4 ++--
 .../org/apache/spark/mllib/classification/NaiveBayes.scala    | 4 ++--
 .../spark/ml/classification/LogisticRegressionSuite.scala     | 2 +-
 .../org/apache/spark/ml/classification/NaiveBayesSuite.scala  | 2 +-
 .../spark/ml/optim/aggregator/LogisticAggregatorSuite.scala   | 2 +-
 .../apache/spark/mllib/classification/NaiveBayesSuite.scala   | 2 +-
 .../spark/mllib/optimization/GradientDescentSuite.scala       | 2 +-
 .../apache/spark/mllib/random/RandomDataGeneratorSuite.scala  | 2 +-
 11 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java
index b78c1677a1..f6c1c39bbf 100644
--- a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java
+++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java
@@ -60,7 +60,7 @@ class CountMinSketchImpl extends CountMinSketch implements Serializable {
     this.eps = eps;
     this.confidence = confidence;
     this.width = (int) Math.ceil(2 / eps);
-    this.depth = (int) Math.ceil(-Math.log(1 - confidence) / Math.log(2));
+    this.depth = (int) Math.ceil(-Math.log1p(-confidence) / Math.log(2));
     initTablesWith(depth, width, seed);
   }

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index 1a7a5e7a52..e97af0582d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -287,10 +287,10 @@ class NaiveBayesModel private[ml] (
   private lazy val (thetaMinusNegTheta, negThetaSum) = $(modelType) match {
     case Multinomial => (None, None)
     case Bernoulli =>
-      val negTheta = theta.map(value => math.log(1.0 - math.exp(value)))
+      val negTheta = theta.map(value => math.log1p(-math.exp(value)))
       val ones = new DenseVector(Array.fill(theta.numCols) {1.0})
       val thetaMinusNegTheta = theta.map { value =>
-        value - math.log(1.0 - math.exp(value))
+        value - math.log1p(-math.exp(value))
       }
       (Option(thetaMinusNegTheta), Option(negTheta.multiply(ones)))
     case _ =>

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
index 1565782dd6..a65592f0e7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -342,7 +342,7 @@ class AFTSurvivalRegressionModel private[ml] (
     // shape parameter for the Weibull distribution of lifetime
     val k = 1 / scale
     val quantiles = $(quantileProbabilities).map {
-      q => lambda * math.exp(math.log(-math.log(1 - q)) / k)
+      q => lambda * math.exp(math.log(-math.log1p(-q)) / k)
     }
     Vectors.dense(quantiles)
   }

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index b1a8f95c12..a8f4ed9096 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -971,9 +971,9 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine

   private[regression] object CLogLog extends Link("cloglog") {

-    override def link(mu: Double): Double = math.log(-1.0 * math.log(1 - mu))
+    override def link(mu: Double): Double = math.log(-math.log1p(-mu))

-    override def deriv(mu: Double): Double = 1.0 / ((mu - 1.0) * math.log(1.0 - mu))
+    override def deriv(mu: Double): Double = 1.0 / ((mu - 1.0) * math.log1p(-mu))

     override def unlink(eta: Double): Double = 1.0 - math.exp(-1.0 * math.exp(eta))
   }

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index 79bb4adac8..19df156d83 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -75,10 +75,10 @@ class NaiveBayesModel private[spark] (
   private val (thetaMinusNegTheta, negThetaSum) = modelType match {
     case Multinomial => (None, None)
     case Bernoulli =>
-      val negTheta = thetaMatrix.map(value => math.log(1.0 - math.exp(value)))
+      val negTheta = thetaMatrix.map(value => math.log1p(-math.exp(value)))
       val ones = new DenseVector(Array.fill(thetaMatrix.numCols) {1.0})
       val thetaMinusNegTheta = thetaMatrix.map { value =>
-        value - math.log(1.0 - math.exp(value))
+        value - math.log1p(-math.exp(value))
       }
       (Option(thetaMinusNegTheta), Option(negTheta.multiply(ones)))
     case _ =>

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 6dea4b1903..2b5a9a396e 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -1344,7 +1344,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
        b_k' = b_k - \mean(b_k)
        }}}
      */
-    val rawInterceptsTheory = histogram.map(c => math.log(c + 1)) // add 1 for smoothing
+    val rawInterceptsTheory = histogram.map(math.log1p) // add 1 for smoothing
     val rawMean = rawInterceptsTheory.sum / rawInterceptsTheory.length
     val interceptsTheory = Vectors.dense(rawInterceptsTheory.map(_ - rawMean))
     val coefficientsTheory = new DenseMatrix(numClasses, numFeatures,

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
index a8c4f091b2..9100ef1db6 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
@@ -81,7 +81,7 @@ class NaiveBayesSuite extends MLTest with DefaultReadWriteTest {
   }

   def expectedBernoulliProbabilities(model: NaiveBayesModel, feature: Vector): Vector = {
-    val negThetaMatrix = model.theta.map(v => math.log(1.0 - math.exp(v)))
+    val negThetaMatrix = model.theta.map(v => math.log1p(-math.exp(v)))
     val negFeature = Vectors.dense(feature.toArray.map(v => 1.0 - v))
     val piTheta: BV[Double] = model.pi.asBreeze + model.theta.multiply(feature).asBreeze
     val logClassProbs: BV[Double] = piTheta + negThetaMatrix.multiply(negFeature).asBreeze

diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala
index 4c7913d5d2..e699adcc14 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala
@@ -213,7 +213,7 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
     val lossSum = binaryInstances.map { case Instance(l, w, f) =>
       val margin = BLAS.dot(Vectors.dense(stdCoef), f) + intercept
       val prob = 1.0 / (1.0 + math.exp(-margin))
-      -w * l * math.log(prob) - w * (1.0 - l) * math.log(1.0 - prob)
+      -w * l * math.log(prob) - w * (1.0 - l) * math.log1p(-prob)
     }.sum
     val loss = lossSum / weightSum

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
index 8c7d583923..725389813b 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
@@ -233,7 +233,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
     val piVector = new BDV(model.pi)
     val thetaMatrix = new BDM(model.theta(0).length, model.theta.length, model.theta.flatten).t
     val negThetaMatrix = new BDM(model.theta(0).length, model.theta.length,
-      model.theta.flatten.map(v => math.log(1.0 - math.exp(v)))).t
+      model.theta.flatten.map(v => math.log1p(-math.exp(v)))).t
     val testBreeze = testData.asBreeze
     val negTestBreeze = new BDV(Array.fill(testBreeze.size)(1.0)) - testBreeze
     val piTheta: BV[Double] = piVector + (thetaMatrix * testBreeze)

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala
index 6250b0363e..a5542565c7 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala
@@ -50,7 +50,7 @@ object GradientDescentSuite {
     val unifRand = new Random(45)
     val rLogis = (0 until nPoints).map { i =>
       val u = unifRand.nextDouble()
-      math.log(u) - math.log(1.0-u)
+      math.log(u) - math.log1p(-u)
     }

     val y: Seq[Int] = (0 until nPoints).map { i =>

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
index 8011026e6f..b3bf5a2a8f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
@@ -89,7 +89,7 @@ class RandomDataGeneratorSuite extends SparkFunSuite {
     val expectedMean = math.exp(mean + 0.5 * vari)
     // variance of log normal = (e^var - 1) * e^(2 * mean + var)
-    val expectedStd = math.sqrt((math.exp(vari) - 1.0) * math.exp(2.0 * mean + vari))
+    val expectedStd = math.sqrt(math.expm1(vari) * math.exp(2.0 * mean + vari))
     // since sampling error increases with variance, let's set
     // the absolute tolerance as a percentage
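As a worked check of the last hunk (an editorial sketch, not part of the patch; `ExpM1Check` is a made-up name): for a tiny `vari`, `exp(vari) - 1.0` cancels catastrophically while `expm1(vari)` stays accurate, so the two forms of `expectedStd` diverge:

```scala
// Log-normal std, expectedStd = sqrt((e^var - 1) * e^(2 * mean + var)),
// with mean = 0 and a deliberately tiny variance so the cancellation shows.
object ExpM1Check extends App {
  val mean = 0.0
  val vari = 1e-10
  val naiveStd  = math.sqrt((math.exp(vari) - 1.0) * math.exp(2.0 * mean + vari))
  val stableStd = math.sqrt(math.expm1(vari) * math.exp(2.0 * mean + vari))
  println(naiveStd)   // ~1.00000004e-5: cancellation leaves only ~8 good digits
  println(stableStd)  // ~1.00000000e-5: accurate to full double precision
}
```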