[SPARK-8661][ML] for LinearRegressionSuite.scala, changed javadoc-style comments to regular multiline comments, to make copy-pasting R code simpler
for mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala, changed javadoc-style comments to regular multiline comments, to make copy-pasting R code simpler. Author: Rosstin <asterazul@gmail.com> Closes #7098 from Rosstin/SPARK-8661 and squashes the following commits: 5a05dee [Rosstin] SPARK-8661 for LinearRegressionSuite.scala, changed javadoc-style comments to regular multiline comments to make it easier to copy-paste the R code. bb9a4b1 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8660 242aedd [Rosstin] SPARK-8660, changed comment style from JavaDoc style to normal multiline comment in order to make copypaste into R easier, in file classification/LogisticRegressionSuite.scala 2cd2985 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8639 21ac1e5 [Rosstin] Merge branch 'master' of github.com:apache/spark into SPARK-8639 6c18058 [Rosstin] fixed minor typos in docs/README.md and docs/api.md
This commit is contained in:
parent
ed359de595
commit
4e880cf596
|
@ -28,26 +28,26 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
|
|||
@transient var dataset: DataFrame = _
|
||||
@transient var datasetWithoutIntercept: DataFrame = _
|
||||
|
||||
/**
|
||||
* In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
|
||||
* is the same as the one trained by R's glmnet package. The following instruction
|
||||
* describes how to reproduce the data in R.
|
||||
*
|
||||
* import org.apache.spark.mllib.util.LinearDataGenerator
|
||||
* val data =
|
||||
* sc.parallelize(LinearDataGenerator.generateLinearInput(6.3, Array(4.7, 7.2),
|
||||
* Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2)
|
||||
* data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).coalesce(1)
|
||||
* .saveAsTextFile("path")
|
||||
/*
|
||||
In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
|
||||
is the same as the one trained by R's glmnet package. The following instructions
|
||||
describe how to reproduce the data in R.
|
||||
|
||||
import org.apache.spark.mllib.util.LinearDataGenerator
|
||||
val data =
|
||||
sc.parallelize(LinearDataGenerator.generateLinearInput(6.3, Array(4.7, 7.2),
|
||||
Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2)
|
||||
data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1)).coalesce(1)
|
||||
.saveAsTextFile("path")
|
||||
*/
|
||||
override def beforeAll(): Unit = {
|
||||
super.beforeAll()
|
||||
dataset = sqlContext.createDataFrame(
|
||||
sc.parallelize(LinearDataGenerator.generateLinearInput(
|
||||
6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2))
|
||||
/**
|
||||
* datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating
|
||||
* training model without intercept
|
||||
/*
|
||||
datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating
|
||||
training model without intercept
|
||||
*/
|
||||
datasetWithoutIntercept = sqlContext.createDataFrame(
|
||||
sc.parallelize(LinearDataGenerator.generateLinearInput(
|
||||
|
@ -59,20 +59,20 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
|
|||
val trainer = new LinearRegression
|
||||
val model = trainer.fit(dataset)
|
||||
|
||||
/**
|
||||
* Using the following R code to load the data and train the model using glmnet package.
|
||||
*
|
||||
* library("glmnet")
|
||||
* data <- read.csv("path", header=FALSE, stringsAsFactors=FALSE)
|
||||
* features <- as.matrix(data.frame(as.numeric(data$V2), as.numeric(data$V3)))
|
||||
* label <- as.numeric(data$V1)
|
||||
* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0))
|
||||
* > weights
|
||||
* 3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
* s0
|
||||
* (Intercept) 6.300528
|
||||
* as.numeric.data.V2. 4.701024
|
||||
* as.numeric.data.V3. 7.198257
|
||||
/*
|
||||
Use the following R code to load the data and train the model using the glmnet package.
|
||||
|
||||
library("glmnet")
|
||||
data <- read.csv("path", header=FALSE, stringsAsFactors=FALSE)
|
||||
features <- as.matrix(data.frame(as.numeric(data$V2), as.numeric(data$V3)))
|
||||
label <- as.numeric(data$V1)
|
||||
weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0))
|
||||
> weights
|
||||
3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
s0
|
||||
(Intercept) 6.300528
|
||||
as.numeric.data.V2. 4.701024
|
||||
as.numeric.data.V3. 7.198257
|
||||
*/
|
||||
val interceptR = 6.298698
|
||||
val weightsR = Array(4.700706, 7.199082)
|
||||
|
@ -94,29 +94,29 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
|
|||
val model = trainer.fit(dataset)
|
||||
val modelWithoutIntercept = trainer.fit(datasetWithoutIntercept)
|
||||
|
||||
/**
|
||||
* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0,
|
||||
* intercept = FALSE))
|
||||
* > weights
|
||||
* 3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
* s0
|
||||
* (Intercept) .
|
||||
* as.numeric.data.V2. 6.995908
|
||||
* as.numeric.data.V3. 5.275131
|
||||
/*
|
||||
weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0,
|
||||
intercept = FALSE))
|
||||
> weights
|
||||
3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
s0
|
||||
(Intercept) .
|
||||
as.numeric.data.V2. 6.995908
|
||||
as.numeric.data.V3. 5.275131
|
||||
*/
|
||||
val weightsR = Array(6.995908, 5.275131)
|
||||
|
||||
assert(model.intercept ~== 0 relTol 1E-3)
|
||||
assert(model.weights(0) ~== weightsR(0) relTol 1E-3)
|
||||
assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
|
||||
/**
|
||||
* Then again with the data with no intercept:
|
||||
* > weightsWithoutIntercept
|
||||
* 3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
* s0
|
||||
* (Intercept) .
|
||||
* as.numeric.data3.V2. 4.70011
|
||||
* as.numeric.data3.V3. 7.19943
|
||||
/*
|
||||
Then again with the data with no intercept:
|
||||
> weightsWithoutIntercept
|
||||
3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
s0
|
||||
(Intercept) .
|
||||
as.numeric.data3.V2. 4.70011
|
||||
as.numeric.data3.V3. 7.19943
|
||||
*/
|
||||
val weightsWithoutInterceptR = Array(4.70011, 7.19943)
|
||||
|
||||
|
@ -129,14 +129,14 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
|
|||
val trainer = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57)
|
||||
val model = trainer.fit(dataset)
|
||||
|
||||
/**
|
||||
* weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57))
|
||||
* > weights
|
||||
* 3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
* s0
|
||||
* (Intercept) 6.24300
|
||||
* as.numeric.data.V2. 4.024821
|
||||
* as.numeric.data.V3. 6.679841
|
||||
/*
|
||||
weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57))
|
||||
> weights
|
||||
3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
s0
|
||||
(Intercept) 6.24300
|
||||
as.numeric.data.V2. 4.024821
|
||||
as.numeric.data.V3. 6.679841
|
||||
*/
|
||||
val interceptR = 6.24300
|
||||
val weightsR = Array(4.024821, 6.679841)
|
||||
|
@ -158,15 +158,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
|
|||
.setFitIntercept(false)
|
||||
val model = trainer.fit(dataset)
|
||||
|
||||
/**
|
||||
* weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57,
|
||||
* intercept=FALSE))
|
||||
* > weights
|
||||
* 3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
* s0
|
||||
* (Intercept) .
|
||||
* as.numeric.data.V2. 6.299752
|
||||
* as.numeric.data.V3. 4.772913
|
||||
/*
|
||||
weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57,
|
||||
intercept=FALSE))
|
||||
> weights
|
||||
3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
s0
|
||||
(Intercept) .
|
||||
as.numeric.data.V2. 6.299752
|
||||
as.numeric.data.V3. 4.772913
|
||||
*/
|
||||
val interceptR = 0.0
|
||||
val weightsR = Array(6.299752, 4.772913)
|
||||
|
@ -187,14 +187,14 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
|
|||
val trainer = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
|
||||
val model = trainer.fit(dataset)
|
||||
|
||||
/**
|
||||
* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3))
|
||||
* > weights
|
||||
* 3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
* s0
|
||||
* (Intercept) 6.328062
|
||||
* as.numeric.data.V2. 3.222034
|
||||
* as.numeric.data.V3. 4.926260
|
||||
/*
|
||||
weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3))
|
||||
> weights
|
||||
3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
s0
|
||||
(Intercept) 6.328062
|
||||
as.numeric.data.V2. 3.222034
|
||||
as.numeric.data.V3. 4.926260
|
||||
*/
|
||||
val interceptR = 5.269376
|
||||
val weightsR = Array(3.736216, 5.712356)
|
||||
|
@ -216,15 +216,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
|
|||
.setFitIntercept(false)
|
||||
val model = trainer.fit(dataset)
|
||||
|
||||
/**
|
||||
* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
|
||||
* intercept = FALSE))
|
||||
* > weights
|
||||
* 3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
* s0
|
||||
* (Intercept) .
|
||||
* as.numeric.data.V2. 5.522875
|
||||
* as.numeric.data.V3. 4.214502
|
||||
/*
|
||||
weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3,
|
||||
intercept = FALSE))
|
||||
> weights
|
||||
3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
s0
|
||||
(Intercept) .
|
||||
as.numeric.data.V2. 5.522875
|
||||
as.numeric.data.V3. 4.214502
|
||||
*/
|
||||
val interceptR = 0.0
|
||||
val weightsR = Array(5.522875, 4.214502)
|
||||
|
@ -245,14 +245,14 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
|
|||
val trainer = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6)
|
||||
val model = trainer.fit(dataset)
|
||||
|
||||
/**
|
||||
* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6))
|
||||
* > weights
|
||||
* 3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
* s0
|
||||
* (Intercept) 6.324108
|
||||
* as.numeric.data.V2. 3.168435
|
||||
* as.numeric.data.V3. 5.200403
|
||||
/*
|
||||
weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6))
|
||||
> weights
|
||||
3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
s0
|
||||
(Intercept) 6.324108
|
||||
as.numeric.data.V2. 3.168435
|
||||
as.numeric.data.V3. 5.200403
|
||||
*/
|
||||
val interceptR = 5.696056
|
||||
val weightsR = Array(3.670489, 6.001122)
|
||||
|
@ -274,15 +274,15 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
|
|||
.setFitIntercept(false)
|
||||
val model = trainer.fit(dataset)
|
||||
|
||||
/**
|
||||
* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6,
|
||||
* intercept=FALSE))
|
||||
* > weights
|
||||
* 3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
* s0
|
||||
* (Intercept) .
|
||||
* as.numeric.dataM.V2. 5.673348
|
||||
* as.numeric.dataM.V3. 4.322251
|
||||
/*
|
||||
weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6,
|
||||
intercept=FALSE))
|
||||
> weights
|
||||
3 x 1 sparse Matrix of class "dgCMatrix"
|
||||
s0
|
||||
(Intercept) .
|
||||
as.numeric.dataM.V2. 5.673348
|
||||
as.numeric.dataM.V3. 4.322251
|
||||
*/
|
||||
val interceptR = 0.0
|
||||
val weightsR = Array(5.673348, 4.322251)
|
||||
|
|
Loading…
Reference in a new issue