[SPARK-28604][ML] Use log1p(x) over log(1+x) and expm1(x) over exp(x)-1 for accuracy

## What changes were proposed in this pull request?

Use `log1p(x)` over `log(1+x)` and `expm1(x)` over `exp(x)-1` for accuracy, where possible. This should improve accuracy a tiny bit in ML-related calculations, and shouldn't hurt in any event.
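As a quick illustration (not part of the patch), the precision loss shows up for arguments near zero, where `1 + x` rounds away low-order bits before `log`/`exp` is applied. A minimal Scala sketch:

```scala
// Illustration only: for a tiny x, 1.0 + x rounds to the nearest double,
// so log(1 + x) and exp(x) - 1 carry roughly 10% relative error here,
// while log1p(x) and expm1(x) are accurate to within about 1 ulp.
object Log1pExpm1Demo extends App {
  val x = 1e-15
  println(math.log(1 + x))   // ~1.11e-15 (inaccurate)
  println(math.log1p(x))     // ~1.00e-15 (accurate)
  println(math.exp(x) - 1)   // ~1.11e-15 (inaccurate)
  println(math.expm1(x))     // ~1.00e-15 (accurate)
}
```

The changes below apply the same idea, e.g. rewriting `math.log(1.0 - math.exp(value))` as `math.log1p(-math.exp(value))`.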

## How was this patch tested?

Existing tests.

Closes #25337 from srowen/SPARK-28604.

Authored-by: Sean Owen <sean.owen@databricks.com>
Signed-off-by: Sean Owen <sean.owen@databricks.com>
Sean Owen 2019-08-04 17:04:01 -05:00
parent 4856c0e33a
commit c09675779b
11 changed files with 14 additions and 14 deletions

@@ -60,7 +60,7 @@ class CountMinSketchImpl extends CountMinSketch implements Serializable {
     this.eps = eps;
     this.confidence = confidence;
     this.width = (int) Math.ceil(2 / eps);
-    this.depth = (int) Math.ceil(-Math.log(1 - confidence) / Math.log(2));
+    this.depth = (int) Math.ceil(-Math.log1p(-confidence) / Math.log(2));
     initTablesWith(depth, width, seed);
   }

@@ -287,10 +287,10 @@ class NaiveBayesModel private[ml] (
   private lazy val (thetaMinusNegTheta, negThetaSum) = $(modelType) match {
     case Multinomial => (None, None)
     case Bernoulli =>
-      val negTheta = theta.map(value => math.log(1.0 - math.exp(value)))
+      val negTheta = theta.map(value => math.log1p(-math.exp(value)))
       val ones = new DenseVector(Array.fill(theta.numCols) {1.0})
       val thetaMinusNegTheta = theta.map { value =>
-        value - math.log(1.0 - math.exp(value))
+        value - math.log1p(-math.exp(value))
       }
       (Option(thetaMinusNegTheta), Option(negTheta.multiply(ones)))
     case _ =>

@@ -342,7 +342,7 @@ class AFTSurvivalRegressionModel private[ml] (
     // shape parameter for the Weibull distribution of lifetime
     val k = 1 / scale
     val quantiles = $(quantileProbabilities).map {
-      q => lambda * math.exp(math.log(-math.log(1 - q)) / k)
+      q => lambda * math.exp(math.log(-math.log1p(-q)) / k)
     }
     Vectors.dense(quantiles)
   }

@@ -971,9 +971,9 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
   private[regression] object CLogLog extends Link("cloglog") {
-    override def link(mu: Double): Double = math.log(-1.0 * math.log(1 - mu))
+    override def link(mu: Double): Double = math.log(-math.log1p(-mu))
-    override def deriv(mu: Double): Double = 1.0 / ((mu - 1.0) * math.log(1.0 - mu))
+    override def deriv(mu: Double): Double = 1.0 / ((mu - 1.0) * math.log1p(-mu))
     override def unlink(eta: Double): Double = 1.0 - math.exp(-1.0 * math.exp(eta))
   }

@@ -75,10 +75,10 @@ class NaiveBayesModel private[spark] (
   private val (thetaMinusNegTheta, negThetaSum) = modelType match {
     case Multinomial => (None, None)
     case Bernoulli =>
-      val negTheta = thetaMatrix.map(value => math.log(1.0 - math.exp(value)))
+      val negTheta = thetaMatrix.map(value => math.log1p(-math.exp(value)))
       val ones = new DenseVector(Array.fill(thetaMatrix.numCols) {1.0})
       val thetaMinusNegTheta = thetaMatrix.map { value =>
-        value - math.log(1.0 - math.exp(value))
+        value - math.log1p(-math.exp(value))
       }
       (Option(thetaMinusNegTheta), Option(negTheta.multiply(ones)))
     case _ =>

@@ -1344,7 +1344,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest {
         b_k' = b_k - \mean(b_k)
       }}}
     */
-    val rawInterceptsTheory = histogram.map(c => math.log(c + 1)) // add 1 for smoothing
+    val rawInterceptsTheory = histogram.map(math.log1p) // add 1 for smoothing
     val rawMean = rawInterceptsTheory.sum / rawInterceptsTheory.length
     val interceptsTheory = Vectors.dense(rawInterceptsTheory.map(_ - rawMean))
     val coefficientsTheory = new DenseMatrix(numClasses, numFeatures,

@@ -81,7 +81,7 @@ class NaiveBayesSuite extends MLTest with DefaultReadWriteTest {
   }

   def expectedBernoulliProbabilities(model: NaiveBayesModel, feature: Vector): Vector = {
-    val negThetaMatrix = model.theta.map(v => math.log(1.0 - math.exp(v)))
+    val negThetaMatrix = model.theta.map(v => math.log1p(-math.exp(v)))
     val negFeature = Vectors.dense(feature.toArray.map(v => 1.0 - v))
     val piTheta: BV[Double] = model.pi.asBreeze + model.theta.multiply(feature).asBreeze
     val logClassProbs: BV[Double] = piTheta + negThetaMatrix.multiply(negFeature).asBreeze

@@ -213,7 +213,7 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
     val lossSum = binaryInstances.map { case Instance(l, w, f) =>
       val margin = BLAS.dot(Vectors.dense(stdCoef), f) + intercept
       val prob = 1.0 / (1.0 + math.exp(-margin))
-      -w * l * math.log(prob) - w * (1.0 - l) * math.log(1.0 - prob)
+      -w * l * math.log(prob) - w * (1.0 - l) * math.log1p(-prob)
     }.sum
     val loss = lossSum / weightSum

@@ -233,7 +233,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
     val piVector = new BDV(model.pi)
     val thetaMatrix = new BDM(model.theta(0).length, model.theta.length, model.theta.flatten).t
     val negThetaMatrix = new BDM(model.theta(0).length, model.theta.length,
-      model.theta.flatten.map(v => math.log(1.0 - math.exp(v)))).t
+      model.theta.flatten.map(v => math.log1p(-math.exp(v)))).t
     val testBreeze = testData.asBreeze
     val negTestBreeze = new BDV(Array.fill(testBreeze.size)(1.0)) - testBreeze
     val piTheta: BV[Double] = piVector + (thetaMatrix * testBreeze)

@@ -50,7 +50,7 @@ object GradientDescentSuite {
     val unifRand = new Random(45)
     val rLogis = (0 until nPoints).map { i =>
       val u = unifRand.nextDouble()
-      math.log(u) - math.log(1.0-u)
+      math.log(u) - math.log1p(-u)
     }
     val y: Seq[Int] = (0 until nPoints).map { i =>

@@ -89,7 +89,7 @@ class RandomDataGeneratorSuite extends SparkFunSuite {
     val expectedMean = math.exp(mean + 0.5 * vari)
     // variance of log normal = (e^var - 1) * e^(2 * mean + var)
-    val expectedStd = math.sqrt((math.exp(vari) - 1.0) * math.exp(2.0 * mean + vari))
+    val expectedStd = math.sqrt(math.expm1(vari) * math.exp(2.0 * mean + vari))
     // since sampling error increases with variance, let's set
     // the absolute tolerance as a percentage