[SPARK-16485][ML][DOC] Fix privacy of GLM members, rename sqlDataTypes for ML, doc fixes
## What changes were proposed in this pull request?

Fixing issues found during 2.0 API checks:
* GeneralizedLinearRegressionModel: linkObj, familyObj, familyAndLink should not be exposed
* sqlDataTypes: name does not follow conventions. Do we need to expose it?
* Evaluator: inconsistent doc between evaluate and isLargerBetter
* MinMaxScaler: math rendering --> hard to make it great, but I'll change it a little
* GeneralizedLinearRegressionSummary: aic doc is incorrect --> will change to use more common name

## How was this patch tested?

Existing unit tests. Docs generated locally. (MinMaxScaler is improved a tiny bit.)

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #14187 from jkbradley/final-api-check-2.0.
This commit is contained in:
parent
c5ec879828
commit
a5f51e2162
|
@ -30,7 +30,8 @@ import org.apache.spark.sql.Dataset
|
|||
abstract class Evaluator extends Params {
|
||||
|
||||
/**
|
||||
* Evaluates model output and returns a scalar metric (larger is better).
|
||||
* Evaluates model output and returns a scalar metric.
|
||||
* The value of [[isLargerBetter]] specifies whether larger values are better.
|
||||
*
|
||||
* @param dataset a dataset that contains labels/observations and predictions.
|
||||
* @param paramMap parameter map that specifies the input columns and output metrics
|
||||
|
@ -42,7 +43,9 @@ abstract class Evaluator extends Params {
|
|||
}
|
||||
|
||||
/**
|
||||
* Evaluates the output.
|
||||
* Evaluates model output and returns a scalar metric.
|
||||
* The value of [[isLargerBetter]] specifies whether larger values are better.
|
||||
*
|
||||
* @param dataset a dataset that contains labels/observations and predictions.
|
||||
* @return metric
|
||||
*/
|
||||
|
|
|
@ -78,9 +78,9 @@ private[feature] trait MinMaxScalerParams extends Params with HasInputCol with H
|
|||
* statistics, which is also known as min-max normalization or Rescaling. The rescaled value for
|
||||
* feature E is calculated as,
|
||||
*
|
||||
* Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min
|
||||
* `Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min`
|
||||
*
|
||||
* For the case E_{max} == E_{min}, Rescaled(e_i) = 0.5 * (max + min)
|
||||
* For the case `E_{max} == E_{min}`, `Rescaled(e_i) = 0.5 * (max + min)`.
|
||||
* Note that since zero values will probably be transformed to non-zero values, output of the
|
||||
* transformer will be DenseVector even for sparse input.
|
||||
*/
|
||||
|
|
|
@ -17,15 +17,16 @@
|
|||
|
||||
package org.apache.spark.ml.linalg
|
||||
|
||||
import org.apache.spark.annotation.DeveloperApi
|
||||
import org.apache.spark.annotation.{DeveloperApi, Since}
|
||||
import org.apache.spark.sql.types.DataType
|
||||
|
||||
/**
|
||||
* :: DeveloperApi ::
|
||||
* SQL data types for vectors and matrices.
|
||||
*/
|
||||
@Since("2.0.0")
|
||||
@DeveloperApi
|
||||
object sqlDataTypes {
|
||||
object SQLDataTypes {
|
||||
|
||||
/** Data type for [[Vector]]. */
|
||||
val VectorType: DataType = new VectorUDT
|
|
@ -376,7 +376,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
|
|||
def deviance(y: Double, mu: Double, weight: Double): Double
|
||||
|
||||
/**
|
||||
* Akaike's 'An Information Criterion'(AIC) value of the family for a given dataset.
|
||||
* Akaike Information Criterion (AIC) value of the family for a given dataset.
|
||||
*
|
||||
* @param predictions an RDD of (y, mu, weight) of instances in evaluation dataset
|
||||
* @param deviance the deviance for the fitted model in evaluation dataset
|
||||
|
@ -702,13 +702,13 @@ class GeneralizedLinearRegressionModel private[ml] (
|
|||
|
||||
import GeneralizedLinearRegression._
|
||||
|
||||
lazy val familyObj = Family.fromName($(family))
|
||||
lazy val linkObj = if (isDefined(link)) {
|
||||
private lazy val familyObj = Family.fromName($(family))
|
||||
private lazy val linkObj = if (isDefined(link)) {
|
||||
Link.fromName($(link))
|
||||
} else {
|
||||
familyObj.defaultLink
|
||||
}
|
||||
lazy val familyAndLink = new FamilyAndLink(familyObj, linkObj)
|
||||
private lazy val familyAndLink = new FamilyAndLink(familyObj, linkObj)
|
||||
|
||||
override protected def predict(features: Vector): Double = {
|
||||
val eta = predictLink(features)
|
||||
|
@ -1021,7 +1021,7 @@ class GeneralizedLinearRegressionSummary private[regression] (
|
|||
rss / degreesOfFreedom
|
||||
}
|
||||
|
||||
/** Akaike's "An Information Criterion"(AIC) for the fitted model. */
|
||||
/** Akaike Information Criterion (AIC) for the fitted model. */
|
||||
@Since("2.0.0")
|
||||
lazy val aic: Double = {
|
||||
val w = weightCol
|
||||
|
|
|
@ -20,7 +20,7 @@ package org.apache.spark.ml.linalg;
|
|||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import static org.apache.spark.ml.linalg.sqlDataTypes.*;
|
||||
import static org.apache.spark.ml.linalg.SQLDataTypes.*;
|
||||
|
||||
public class JavaSQLDataTypesSuite {
|
||||
@Test
|
||||
|
|
|
@ -21,7 +21,7 @@ import org.apache.spark.SparkFunSuite
|
|||
|
||||
class SQLDataTypesSuite extends SparkFunSuite {
|
||||
test("sqlDataTypes") {
|
||||
assert(sqlDataTypes.VectorType === new VectorUDT)
|
||||
assert(sqlDataTypes.MatrixType === new MatrixUDT)
|
||||
assert(SQLDataTypes.VectorType === new VectorUDT)
|
||||
assert(SQLDataTypes.MatrixType === new MatrixUDT)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue