From c09675779b5669119ed225bd4eb69c9c12c9be51 Mon Sep 17 00:00:00 2001
From: Sean Owen
Date: Sun, 4 Aug 2019 17:04:01 -0500
Subject: [PATCH] [SPARK-28604][ML] Use log1p(x) over log(1+x) and expm1(x)
 over exp(x)-1 for accuracy

## What changes were proposed in this pull request?

Use `log1p(x)` over `log(1+x)` and `expm1(x)` over `exp(x)-1` for accuracy, where possible. This should improve accuracy a tiny bit in ML-related calculations, and shouldn't hurt in any event.

## How was this patch tested?

Existing tests.

Closes #25337 from srowen/SPARK-28604.

Authored-by: Sean Owen
Signed-off-by: Sean Owen
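For illustration, a minimal self-contained sketch of the rounding behavior these substitutions avoid (not code from this patch; `Log1pDemo` is just a placeholder name):

```scala
// Near zero, 1.0 + x and 1.0 - x round to exactly 1.0 in double precision,
// so the naive forms lose all significance; log1p/expm1 keep full precision.
object Log1pDemo extends App {
  val x = 1e-18
  println(math.log(1.0 + x))  // 0.0       (1 + x already rounded to 1)
  println(math.log1p(x))      // ~1.0e-18  (correct)
  println(math.log(1.0 - x))  // 0.0       (1 - x already rounded to 1)
  println(math.log1p(-x))     // ~-1.0e-18 (correct)
  println(math.exp(x) - 1.0)  // 0.0       (exp(x) rounded to 1)
  println(math.expm1(x))      // ~1.0e-18  (correct)
}
```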
---
 .../java/org/apache/spark/util/sketch/CountMinSketchImpl.java | 2 +-
 .../scala/org/apache/spark/ml/classification/NaiveBayes.scala | 4 ++--
 .../apache/spark/ml/regression/AFTSurvivalRegression.scala    | 2 +-
 .../spark/ml/regression/GeneralizedLinearRegression.scala     | 4 ++--
 .../org/apache/spark/mllib/classification/NaiveBayes.scala    | 4 ++--
 .../spark/ml/classification/LogisticRegressionSuite.scala     | 2 +-
 .../org/apache/spark/ml/classification/NaiveBayesSuite.scala  | 2 +-
 .../spark/ml/optim/aggregator/LogisticAggregatorSuite.scala   | 2 +-
 .../apache/spark/mllib/classification/NaiveBayesSuite.scala   | 2 +-
 .../spark/mllib/optimization/GradientDescentSuite.scala       | 2 +-
 .../apache/spark/mllib/random/RandomDataGeneratorSuite.scala  | 2 +-
 11 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java
index b78c1677a1..f6c1c39bbf 100644
--- a/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java
+++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/CountMinSketchImpl.java
@@ -60,7 +60,7 @@ class CountMinSketchImpl extends CountMinSketch implements Serializable {
     this.eps = eps;
     this.confidence = confidence;
     this.width = (int) Math.ceil(2 / eps);
-    this.depth = (int) Math.ceil(-Math.log(1 - confidence) / Math.log(2));
+    this.depth = (int) Math.ceil(-Math.log1p(-confidence) / Math.log(2));
     initTablesWith(depth, width, seed);
   }

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index 1a7a5e7a52..e97af0582d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -287,10 +287,10 @@ class NaiveBayesModel private[ml] (
   private lazy val (thetaMinusNegTheta, negThetaSum) = $(modelType) match {
     case Multinomial => (None, None)
     case Bernoulli =>
-      val negTheta = theta.map(value => math.log(1.0 - math.exp(value)))
+      val negTheta = theta.map(value => math.log1p(-math.exp(value)))
       val ones = new DenseVector(Array.fill(theta.numCols) {1.0})
       val thetaMinusNegTheta = theta.map { value =>
-        value - math.log(1.0 - math.exp(value))
+        value - math.log1p(-math.exp(value))
       }
       (Option(thetaMinusNegTheta), Option(negTheta.multiply(ones)))
     case _ =>

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
index 1565782dd6..a65592f0e7 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala
@@ -342,7 +342,7 @@ class AFTSurvivalRegressionModel private[ml] (
     // shape parameter for the Weibull distribution of lifetime
     val k = 1 / scale
     val quantiles = $(quantileProbabilities).map {
-      q => lambda * math.exp(math.log(-math.log(1 - q)) / k)
+      q => lambda * math.exp(math.log(-math.log1p(-q)) / k)
     }
     Vectors.dense(quantiles)
   }

diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index b1a8f95c12..a8f4ed9096 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -971,9 +971,9 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine

   private[regression] object CLogLog extends Link("cloglog") {

-    override def link(mu: Double): Double = math.log(-1.0 * math.log(1 - mu))
+    override def link(mu: Double): Double = math.log(-math.log1p(-mu))

-    override def deriv(mu: Double): Double = 1.0 / ((mu - 1.0) * math.log(1.0 - mu))
+    override def deriv(mu: Double): Double = 1.0 / ((mu - 1.0) * math.log1p(-mu))

     override def unlink(eta: Double): Double = 1.0 - math.exp(-1.0 * math.exp(eta))
   }

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
index 79bb4adac8..19df156d83 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala
@@ -75,10 +75,10 @@ class NaiveBayesModel private[spark] (
   private val (thetaMinusNegTheta, negThetaSum) = modelType match {
     case Multinomial => (None, None)
     case Bernoulli =>
-      val negTheta = thetaMatrix.map(value => math.log(1.0 - math.exp(value)))
+      val negTheta = thetaMatrix.map(value => math.log1p(-math.exp(value)))
       val ones = new DenseVector(Array.fill(thetaMatrix.numCols) {1.0})
       val thetaMinusNegTheta = thetaMatrix.map { value =>
-        value - math.log(1.0 - math.exp(value))
+        value - math.log1p(-math.exp(value))
       }
       (Option(thetaMinusNegTheta), Option(negTheta.multiply(ones)))
     case _ =>

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 6dea4b1903..2b5a9a396e 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -1344,7 +1344,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
        b_k' = b_k - \mean(b_k)
        }}}
      */
-    val rawInterceptsTheory = histogram.map(c => math.log(c + 1)) // add 1 for smoothing
+    val rawInterceptsTheory = histogram.map(math.log1p) // add 1 for smoothing
     val rawMean = rawInterceptsTheory.sum / rawInterceptsTheory.length
     val interceptsTheory = Vectors.dense(rawInterceptsTheory.map(_ - rawMean))
     val coefficientsTheory = new DenseMatrix(numClasses, numFeatures,

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
index a8c4f091b2..9100ef1db6 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
@@ -81,7 +81,7 @@ class NaiveBayesSuite extends MLTest with DefaultReadWriteTest {
   }

   def expectedBernoulliProbabilities(model: NaiveBayesModel, feature: Vector): Vector = {
-    val negThetaMatrix = model.theta.map(v => math.log(1.0 - math.exp(v)))
+    val negThetaMatrix = model.theta.map(v => math.log1p(-math.exp(v)))
     val negFeature = Vectors.dense(feature.toArray.map(v => 1.0 - v))
     val piTheta: BV[Double] = model.pi.asBreeze + model.theta.multiply(feature).asBreeze
     val logClassProbs: BV[Double] = piTheta + negThetaMatrix.multiply(negFeature).asBreeze

diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala
index 4c7913d5d2..e699adcc14 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala
@@ -213,7 +213,7 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
     val lossSum = binaryInstances.map { case Instance(l, w, f) =>
       val margin = BLAS.dot(Vectors.dense(stdCoef), f) + intercept
       val prob = 1.0 / (1.0 + math.exp(-margin))
-      -w * l * math.log(prob) - w * (1.0 - l) * math.log(1.0 - prob)
+      -w * l * math.log(prob) - w * (1.0 - l) * math.log1p(-prob)
     }.sum
     val loss = lossSum / weightSum

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
index 8c7d583923..725389813b 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala
@@ -233,7 +233,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
     val piVector = new BDV(model.pi)
     val thetaMatrix = new BDM(model.theta(0).length, model.theta.length, model.theta.flatten).t
     val negThetaMatrix = new BDM(model.theta(0).length, model.theta.length,
-      model.theta.flatten.map(v => math.log(1.0 - math.exp(v)))).t
+      model.theta.flatten.map(v => math.log1p(-math.exp(v)))).t
     val testBreeze = testData.asBreeze
     val negTestBreeze = new BDV(Array.fill(testBreeze.size)(1.0)) - testBreeze
     val piTheta: BV[Double] = piVector + (thetaMatrix * testBreeze)

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala
index 6250b0363e..a5542565c7 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/optimization/GradientDescentSuite.scala
@@ -50,7 +50,7 @@ object GradientDescentSuite {
     val unifRand = new Random(45)
     val rLogis = (0 until nPoints).map { i =>
       val u = unifRand.nextDouble()
-      math.log(u) - math.log(1.0-u)
+      math.log(u) - math.log1p(-u)
     }

     val y: Seq[Int] = (0 until nPoints).map { i =>

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
index 8011026e6f..b3bf5a2a8f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/random/RandomDataGeneratorSuite.scala
@@ -89,7 +89,7 @@ class RandomDataGeneratorSuite extends SparkFunSuite {
     val expectedMean = math.exp(mean + 0.5 * vari)
     // variance of log normal = (e^var - 1) * e^(2 * mean + var)
-    val expectedStd = math.sqrt((math.exp(vari) - 1.0) * math.exp(2.0 * mean + vari))
+    val expectedStd = math.sqrt(math.expm1(vari) * math.exp(2.0 * mean + vari))
     // since sampling error increases with variance, let's set
     // the absolute tolerance as a percentage
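As a worked check of the last hunk (an editorial sketch, not part of the patch; `ExpM1Check` is a made-up name): for a tiny `vari`, `exp(vari) - 1.0` cancels catastrophically while `expm1(vari)` stays accurate, so the two forms of `expectedStd` diverge:

```scala
// Log-normal std, expectedStd = sqrt((e^var - 1) * e^(2 * mean + var)),
// with mean = 0 and a deliberately tiny variance so the cancellation shows.
object ExpM1Check extends App {
  val mean = 0.0
  val vari = 1e-10
  val naiveStd  = math.sqrt((math.exp(vari) - 1.0) * math.exp(2.0 * mean + vari))
  val stableStd = math.sqrt(math.expm1(vari) * math.exp(2.0 * mean + vari))
  println(naiveStd)   // ~1.00000004e-5: cancellation leaves only ~8 good digits
  println(stableStd)  // ~1.00000000e-5: accurate to full double precision
}
```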