[SPARK-8736] [ML] GBTRegressor should not threshold prediction
Changed GBTRegressor so it does NOT threshold the prediction. Added a test which fails with the bug but passes after the fix. CC: feynmanliang, mengxr. Author: Joseph K. Bradley <joseph@databricks.com>. Closes #7134 from jkbradley/gbrt-fix and squashes the following commits: 613b90e [Joseph K. Bradley] Changed GBTRegressor so it does NOT threshold the prediction.
This commit is contained in:
parent
4bb8375fc2
commit
3ba23ffd37
|
@ -172,8 +172,7 @@ final class GBTRegressionModel(
|
|||
// TODO: When we add a generic Boosting class, handle transform there? SPARK-7129
|
||||
// Predicts the sum of weighted tree predictions (regression: the sum is NOT thresholded)
|
||||
val treePredictions = _trees.map(_.rootNode.predict(features))
|
||||
val prediction = blas.ddot(numTrees, treePredictions, 1, _treeWeights, 1)
|
||||
if (prediction > 0.0) 1.0 else 0.0
|
||||
blas.ddot(numTrees, treePredictions, 1, _treeWeights, 1)
|
||||
}
|
||||
|
||||
override def copy(extra: ParamMap): GBTRegressionModel = {
|
||||
|
|
|
@ -19,12 +19,13 @@ package org.apache.spark.ml.regression
|
|||
|
||||
import org.apache.spark.SparkFunSuite
|
||||
import org.apache.spark.ml.impl.TreeTests
|
||||
import org.apache.spark.mllib.linalg.Vectors
|
||||
import org.apache.spark.mllib.regression.LabeledPoint
|
||||
import org.apache.spark.mllib.tree.{EnsembleTestHelper, GradientBoostedTrees => OldGBT}
|
||||
import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
|
||||
import org.apache.spark.mllib.util.MLlibTestSparkContext
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.DataFrame
|
||||
import org.apache.spark.sql.{DataFrame, Row}
|
||||
|
||||
|
||||
/**
|
||||
|
@ -67,6 +68,26 @@ class GBTRegressorSuite extends SparkFunSuite with MLlibTestSparkContext {
|
|||
}
|
||||
}
|
||||
|
||||
// Regression test for SPARK-8736: fits a small GBTRegressor and checks that its
// predictions are real-valued rather than thresholded to 0.0 / 1.0.
test("GBTRegressor behaves reasonably on toy data") {
|
||||
// Six labeled points whose labels range from -6 to 11, i.e. clearly not binary.
val df = sqlContext.createDataFrame(Seq(
|
||||
LabeledPoint(10, Vectors.dense(1, 2, 3, 4)),
|
||||
LabeledPoint(-5, Vectors.dense(6, 3, 2, 1)),
|
||||
LabeledPoint(11, Vectors.dense(2, 2, 3, 4)),
|
||||
LabeledPoint(-6, Vectors.dense(6, 4, 2, 1)),
|
||||
LabeledPoint(9, Vectors.dense(1, 2, 6, 4)),
|
||||
LabeledPoint(-4, Vectors.dense(6, 3, 2, 2))
|
||||
))
|
||||
val gbt = new GBTRegressor()
|
||||
.setMaxDepth(2)
|
||||
|
||||
.setMaxIter(2)
|
||||
val model = gbt.fit(df)
|
||||
// Predict on the same rows the model was fit on.
val preds = model.transform(df)
|
||||
val predictions = preds.select("prediction").map(_.getDouble(0))
|
||||
// Checks based on SPARK-8736 (to ensure it is not doing classification)
// A thresholded output could only be 0.0 or 1.0, so neither bound below could hold.
|
||||
assert(predictions.max() > 2)
|
||||
assert(predictions.min() < -1)
|
||||
}
|
||||
|
||||
// TODO: Reinstate test once runWithValidation is implemented SPARK-7132
|
||||
/*
|
||||
test("runWithValidation stops early and performs better on a validation dataset") {
|
||||
|
|
Loading…
Reference in a new issue