[SPARK-28604][ML] Use log1p(x) over log(1+x) and expm1(x) over exp(x)-1 for accuracy
## What changes were proposed in this pull request?

Use `log1p(x)` over `log(1+x)` and `expm1(x)` over `exp(x)-1` for accuracy, where possible. This should improve accuracy a tiny bit in ML-related calculations, and shouldn't hurt in any event.

## How was this patch tested?

Existing tests.

Closes #25337 from srowen/SPARK-28604.

Authored-by: Sean Owen <sean.owen@databricks.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
This commit is contained in:
parent
4856c0e33a
commit
c09675779b
|
@ -60,7 +60,7 @@ class CountMinSketchImpl extends CountMinSketch implements Serializable {
|
|||
this.eps = eps;
|
||||
this.confidence = confidence;
|
||||
this.width = (int) Math.ceil(2 / eps);
|
||||
this.depth = (int) Math.ceil(-Math.log(1 - confidence) / Math.log(2));
|
||||
this.depth = (int) Math.ceil(-Math.log1p(-confidence) / Math.log(2));
|
||||
initTablesWith(depth, width, seed);
|
||||
}
|
||||
|
||||
|
|
|
@ -287,10 +287,10 @@ class NaiveBayesModel private[ml] (
|
|||
private lazy val (thetaMinusNegTheta, negThetaSum) = $(modelType) match {
|
||||
case Multinomial => (None, None)
|
||||
case Bernoulli =>
|
||||
val negTheta = theta.map(value => math.log(1.0 - math.exp(value)))
|
||||
val negTheta = theta.map(value => math.log1p(-math.exp(value)))
|
||||
val ones = new DenseVector(Array.fill(theta.numCols) {1.0})
|
||||
val thetaMinusNegTheta = theta.map { value =>
|
||||
value - math.log(1.0 - math.exp(value))
|
||||
value - math.log1p(-math.exp(value))
|
||||
}
|
||||
(Option(thetaMinusNegTheta), Option(negTheta.multiply(ones)))
|
||||
case _ =>
|
||||
|
|
|
@ -342,7 +342,7 @@ class AFTSurvivalRegressionModel private[ml] (
|
|||
// shape parameter for the Weibull distribution of lifetime
|
||||
val k = 1 / scale
|
||||
val quantiles = $(quantileProbabilities).map {
|
||||
q => lambda * math.exp(math.log(-math.log(1 - q)) / k)
|
||||
q => lambda * math.exp(math.log(-math.log1p(-q)) / k)
|
||||
}
|
||||
Vectors.dense(quantiles)
|
||||
}
|
||||
|
|
|
@ -971,9 +971,9 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
|
|||
|
||||
private[regression] object CLogLog extends Link("cloglog") {
|
||||
|
||||
override def link(mu: Double): Double = math.log(-1.0 * math.log(1 - mu))
|
||||
override def link(mu: Double): Double = math.log(-math.log1p(-mu))
|
||||
|
||||
override def deriv(mu: Double): Double = 1.0 / ((mu - 1.0) * math.log(1.0 - mu))
|
||||
override def deriv(mu: Double): Double = 1.0 / ((mu - 1.0) * math.log1p(-mu))
|
||||
|
||||
override def unlink(eta: Double): Double = 1.0 - math.exp(-1.0 * math.exp(eta))
|
||||
}
|
||||
|
|
|
@ -75,10 +75,10 @@ class NaiveBayesModel private[spark] (
|
|||
private val (thetaMinusNegTheta, negThetaSum) = modelType match {
|
||||
case Multinomial => (None, None)
|
||||
case Bernoulli =>
|
||||
val negTheta = thetaMatrix.map(value => math.log(1.0 - math.exp(value)))
|
||||
val negTheta = thetaMatrix.map(value => math.log1p(-math.exp(value)))
|
||||
val ones = new DenseVector(Array.fill(thetaMatrix.numCols) {1.0})
|
||||
val thetaMinusNegTheta = thetaMatrix.map { value =>
|
||||
value - math.log(1.0 - math.exp(value))
|
||||
value - math.log1p(-math.exp(value))
|
||||
}
|
||||
(Option(thetaMinusNegTheta), Option(negTheta.multiply(ones)))
|
||||
case _ =>
|
||||
|
|
|
@ -1344,7 +1344,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
|
|||
b_k' = b_k - \mean(b_k)
|
||||
}}}
|
||||
*/
|
||||
val rawInterceptsTheory = histogram.map(c => math.log(c + 1)) // add 1 for smoothing
|
||||
val rawInterceptsTheory = histogram.map(math.log1p) // add 1 for smoothing
|
||||
val rawMean = rawInterceptsTheory.sum / rawInterceptsTheory.length
|
||||
val interceptsTheory = Vectors.dense(rawInterceptsTheory.map(_ - rawMean))
|
||||
val coefficientsTheory = new DenseMatrix(numClasses, numFeatures,
|
||||
|
|
|
@ -81,7 +81,7 @@ class NaiveBayesSuite extends MLTest with DefaultReadWriteTest {
|
|||
}
|
||||
|
||||
def expectedBernoulliProbabilities(model: NaiveBayesModel, feature: Vector): Vector = {
|
||||
val negThetaMatrix = model.theta.map(v => math.log(1.0 - math.exp(v)))
|
||||
val negThetaMatrix = model.theta.map(v => math.log1p(-math.exp(v)))
|
||||
val negFeature = Vectors.dense(feature.toArray.map(v => 1.0 - v))
|
||||
val piTheta: BV[Double] = model.pi.asBreeze + model.theta.multiply(feature).asBreeze
|
||||
val logClassProbs: BV[Double] = piTheta + negThetaMatrix.multiply(negFeature).asBreeze
|
||||
|
|
|
@ -213,7 +213,7 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
|
|||
val lossSum = binaryInstances.map { case Instance(l, w, f) =>
|
||||
val margin = BLAS.dot(Vectors.dense(stdCoef), f) + intercept
|
||||
val prob = 1.0 / (1.0 + math.exp(-margin))
|
||||
-w * l * math.log(prob) - w * (1.0 - l) * math.log(1.0 - prob)
|
||||
-w * l * math.log(prob) - w * (1.0 - l) * math.log1p(-prob)
|
||||
}.sum
|
||||
val loss = lossSum / weightSum
|
||||
|
||||
|
|
|
@ -233,7 +233,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
|
|||
val piVector = new BDV(model.pi)
|
||||
val thetaMatrix = new BDM(model.theta(0).length, model.theta.length, model.theta.flatten).t
|
||||
val negThetaMatrix = new BDM(model.theta(0).length, model.theta.length,
|
||||
model.theta.flatten.map(v => math.log(1.0 - math.exp(v)))).t
|
||||
model.theta.flatten.map(v => math.log1p(-math.exp(v)))).t
|
||||
val testBreeze = testData.asBreeze
|
||||
val negTestBreeze = new BDV(Array.fill(testBreeze.size)(1.0)) - testBreeze
|
||||
val piTheta: BV[Double] = piVector + (thetaMatrix * testBreeze)
|
||||
|
|
|
@ -50,7 +50,7 @@ object GradientDescentSuite {
|
|||
val unifRand = new Random(45)
|
||||
val rLogis = (0 until nPoints).map { i =>
|
||||
val u = unifRand.nextDouble()
|
||||
math.log(u) - math.log(1.0-u)
|
||||
math.log(u) - math.log1p(-u)
|
||||
}
|
||||
|
||||
val y: Seq[Int] = (0 until nPoints).map { i =>
|
||||
|
|
|
@ -89,7 +89,7 @@ class RandomDataGeneratorSuite extends SparkFunSuite {
|
|||
val expectedMean = math.exp(mean + 0.5 * vari)
|
||||
|
||||
// variance of log normal = (e^var - 1) * e^(2 * mean + var)
|
||||
val expectedStd = math.sqrt((math.exp(vari) - 1.0) * math.exp(2.0 * mean + vari))
|
||||
val expectedStd = math.sqrt(math.expm1(vari) * math.exp(2.0 * mean + vari))
|
||||
|
||||
// since sampling error increases with variance, let's set
|
||||
// the absolute tolerance as a percentage
|
||||
|
|
Loading…
Reference in a new issue