diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/FMRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/FMRegressor.scala index df4dac1e24..b9307ebb37 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/FMRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/FMRegressor.scala @@ -112,6 +112,10 @@ private[ml] trait FactorizationMachinesParams extends PredictorParams "The solver algorithm for optimization. Supported options: " + s"${supportedSolvers.mkString(", ")}. (Default adamW)", ParamValidators.inArray[String](supportedSolvers)) + + setDefault(factorSize -> 8, fitIntercept -> true, fitLinear -> true, regParam -> 0.0, + miniBatchFraction -> 1.0, initStd -> 0.01, maxIter -> 100, stepSize -> 1.0, tol -> 1E-6, + solver -> AdamW) } private[ml] trait FactorizationMachines extends FactorizationMachinesParams { @@ -308,7 +312,6 @@ class FMRegressor @Since("3.0.0") ( */ @Since("3.0.0") def setFactorSize(value: Int): this.type = set(factorSize, value) - setDefault(factorSize -> 8) /** * Set whether to fit intercept term. @@ -318,7 +321,6 @@ class FMRegressor @Since("3.0.0") ( */ @Since("3.0.0") def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) - setDefault(fitIntercept -> true) /** * Set whether to fit linear term. @@ -328,7 +330,6 @@ class FMRegressor @Since("3.0.0") ( */ @Since("3.0.0") def setFitLinear(value: Boolean): this.type = set(fitLinear, value) - setDefault(fitLinear -> true) /** * Set the L2 regularization parameter. @@ -338,7 +339,6 @@ class FMRegressor @Since("3.0.0") ( */ @Since("3.0.0") def setRegParam(value: Double): this.type = set(regParam, value) - setDefault(regParam -> 0.0) /** * Set the mini-batch fraction parameter. @@ -348,7 +348,6 @@ class FMRegressor @Since("3.0.0") ( */ @Since("3.0.0") def setMiniBatchFraction(value: Double): this.type = set(miniBatchFraction, value) - setDefault(miniBatchFraction -> 1.0) /** * Set the standard deviation of initial coefficients. @@ -358,7 +357,6 @@ class FMRegressor @Since("3.0.0") ( */ @Since("3.0.0") def setInitStd(value: Double): this.type = set(initStd, value) - setDefault(initStd -> 0.01) /** * Set the maximum number of iterations. @@ -368,7 +366,6 @@ class FMRegressor @Since("3.0.0") ( */ @Since("3.0.0") def setMaxIter(value: Int): this.type = set(maxIter, value) - setDefault(maxIter -> 100) /** * Set the initial step size for the first step (like learning rate). @@ -378,7 +375,6 @@ class FMRegressor @Since("3.0.0") ( */ @Since("3.0.0") def setStepSize(value: Double): this.type = set(stepSize, value) - setDefault(stepSize -> 1.0) /** * Set the convergence tolerance of iterations. @@ -388,7 +384,6 @@ class FMRegressor @Since("3.0.0") ( */ @Since("3.0.0") def setTol(value: Double): this.type = set(tol, value) - setDefault(tol -> 1E-6) /** * Set the solver algorithm used for optimization. @@ -399,7 +394,6 @@ class FMRegressor @Since("3.0.0") ( */ @Since("3.0.0") def setSolver(value: String): this.type = set(solver, value) - setDefault(solver -> AdamW) /** * Set the random seed for weight initialization. diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 8336df8e34..f7dfda81d4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -181,6 +181,9 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam s"${supportedSolvers.mkString(", ")}. (Default irls)", ParamValidators.inArray[String](supportedSolvers)) + setDefault(family -> Gaussian.name, variancePower -> 0.0, maxIter -> 25, tol -> 1E-6, + regParam -> 0.0, solver -> IRLS) + @Since("2.0.0") override def validateAndTransformSchema( schema: StructType, @@ -257,7 +260,6 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val */ @Since("2.0.0") def setFamily(value: String): this.type = set(family, value) - setDefault(family -> Gaussian.name) /** * Sets the value of param [[variancePower]]. @@ -268,7 +270,6 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val */ @Since("2.2.0") def setVariancePower(value: Double): this.type = set(variancePower, value) - setDefault(variancePower -> 0.0) /** * Sets the value of param [[linkPower]]. @@ -305,7 +306,6 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val */ @Since("2.0.0") def setMaxIter(value: Int): this.type = set(maxIter, value) - setDefault(maxIter -> 25) /** * Sets the convergence tolerance of iterations. @@ -316,7 +316,6 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val */ @Since("2.0.0") def setTol(value: Double): this.type = set(tol, value) - setDefault(tol -> 1E-6) /** * Sets the regularization parameter for L2 regularization. @@ -332,7 +331,6 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val */ @Since("2.0.0") def setRegParam(value: Double): this.type = set(regParam, value) - setDefault(regParam -> 0.0) /** * Sets the value of param [[weightCol]]. @@ -364,7 +362,6 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val */ @Since("2.0.0") def setSolver(value: String): this.type = set(solver, value) - setDefault(solver -> IRLS) /** * Sets the link prediction (linear predictor) column name. diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index d70932a1bc..cc8ce0567b 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -2421,6 +2421,10 @@ class _MultilayerPerceptronParams(_ProbabilisticClassifierParams, HasSeed, HasMa initialWeights = Param(Params._dummy(), "initialWeights", "The initial weights of the model.", typeConverter=TypeConverters.toVector) + def __init__(self): + super(_MultilayerPerceptronParams, self).__init__() + self._setDefault(maxIter=100, tol=1E-6, blockSize=128, stepSize=0.03, solver="l-bfgs") + @since("1.6.0") def getLayers(self): """ @@ -2524,7 +2528,6 @@ class MultilayerPerceptronClassifier(_JavaProbabilisticClassifier, _MultilayerPe super(MultilayerPerceptronClassifier, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid) - self._setDefault(maxIter=100, tol=1E-6, blockSize=128, stepSize=0.03, solver="l-bfgs") kwargs = self._input_kwargs self.setParams(**kwargs) @@ -3120,9 +3123,6 @@ class FMClassifier(_JavaProbabilisticClassifier, _FactorizationMachinesParams, J super(FMClassifier, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.FMClassifier", self.uid) - self._setDefault(factorSize=8, fitIntercept=True, fitLinear=True, regParam=0.0, - miniBatchFraction=1.0, initStd=0.01, maxIter=100, stepSize=1.0, - tol=1e-6, solver="adamW") kwargs = self._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index b58255ea12..e82a35c8e7 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -1891,6 +1891,11 @@ class _GeneralizedLinearRegressionParams(_PredictorParams, HasFitIntercept, HasM "or empty, we treat all instance offsets as 0.0", typeConverter=TypeConverters.toString) + def __init__(self): + super(_GeneralizedLinearRegressionParams, self).__init__() + self._setDefault(family="gaussian", maxIter=25, tol=1e-6, regParam=0.0, solver="irls", + variancePower=0.0, aggregationDepth=2) + @since("2.0.0") def getFamily(self): """ @@ -2023,8 +2028,6 @@ class GeneralizedLinearRegression(_JavaRegressor, _GeneralizedLinearRegressionPa super(GeneralizedLinearRegression, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.regression.GeneralizedLinearRegression", self.uid) - self._setDefault(family="gaussian", maxIter=25, tol=1e-6, regParam=0.0, solver="irls", - variancePower=0.0, aggregationDepth=2) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -2398,6 +2401,12 @@ class _FactorizationMachinesParams(_PredictorParams, HasMaxIter, HasStepSize, Ha solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + "options: gd, adamW. (Default adamW)", typeConverter=TypeConverters.toString) + def __init__(self): + super(_FactorizationMachinesParams, self).__init__() + self._setDefault(factorSize=8, fitIntercept=True, fitLinear=True, regParam=0.0, + miniBatchFraction=1.0, initStd=0.01, maxIter=100, stepSize=1.0, + tol=1e-6, solver="adamW") + @since("3.0.0") def getFactorSize(self): """ @@ -2489,9 +2498,6 @@ class FMRegressor(_JavaRegressor, _FactorizationMachinesParams, JavaMLWritable, super(FMRegressor, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.regression.FMRegressor", self.uid) - self._setDefault(factorSize=8, fitIntercept=True, fitLinear=True, regParam=0.0, - miniBatchFraction=1.0, initStd=0.01, maxIter=100, stepSize=1.0, - tol=1e-6, solver="adamW") kwargs = self._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/tests/test_persistence.py b/python/pyspark/ml/tests/test_persistence.py index d4edcc26e1..2f6d451851 100644 --- a/python/pyspark/ml/tests/test_persistence.py +++ b/python/pyspark/ml/tests/test_persistence.py @@ -21,19 +21,78 @@ import tempfile import unittest from pyspark.ml import Transformer -from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression, OneVsRest, \ - OneVsRestModel +from pyspark.ml.classification import DecisionTreeClassifier, FMClassifier, \ + FMClassificationModel, LogisticRegression, MultilayerPerceptronClassifier, \ + MultilayerPerceptronClassificationModel, OneVsRest, OneVsRestModel from pyspark.ml.clustering import KMeans from pyspark.ml.feature import Binarizer, HashingTF, PCA from pyspark.ml.linalg import Vectors from pyspark.ml.param import Params from pyspark.ml.pipeline import Pipeline, PipelineModel -from pyspark.ml.regression import DecisionTreeRegressor, LinearRegression +from pyspark.ml.regression import DecisionTreeRegressor, GeneralizedLinearRegression, \ + GeneralizedLinearRegressionModel, \ + LinearRegression from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWriter from pyspark.ml.wrapper import JavaParams from pyspark.testing.mlutils import MockUnaryTransformer, SparkSessionTestCase +class TestDefaultSolver(SparkSessionTestCase): + + def test_multilayer_load(self): + df = self.spark.createDataFrame([(0.0, Vectors.dense([0.0, 0.0])), + (1.0, Vectors.dense([0.0, 1.0])), + (1.0, Vectors.dense([1.0, 0.0])), + (0.0, Vectors.dense([1.0, 1.0]))], + ["label", "features"]) + + mlp = MultilayerPerceptronClassifier(layers=[2, 2, 2], seed=123) + model = mlp.fit(df) + self.assertEqual(model.getSolver(), "l-bfgs") + transformed1 = model.transform(df) + path = tempfile.mkdtemp() + model_path = path + "/mlp" + model.save(model_path) + model2 = MultilayerPerceptronClassificationModel.load(model_path) + self.assertEqual(model2.getSolver(), "l-bfgs") + transformed2 = model2.transform(df) + self.assertEqual(transformed1.take(4), transformed2.take(4)) + + def test_fm_load(self): + df = self.spark.createDataFrame([(1.0, Vectors.dense(1.0)), + (0.0, Vectors.sparse(1, [], []))], + ["label", "features"]) + fm = FMClassifier(factorSize=2, maxIter=50, stepSize=2.0) + model = fm.fit(df) + self.assertEqual(model.getSolver(), "adamW") + transformed1 = model.transform(df) + path = tempfile.mkdtemp() + model_path = path + "/fm" + model.save(model_path) + model2 = FMClassificationModel.load(model_path) + self.assertEqual(model2.getSolver(), "adamW") + transformed2 = model2.transform(df) + self.assertEqual(transformed1.take(2), transformed2.take(2)) + + def test_glr_load(self): + df = self.spark.createDataFrame([(1.0, Vectors.dense(0.0, 0.0)), + (1.0, Vectors.dense(1.0, 2.0)), + (2.0, Vectors.dense(0.0, 0.0)), + (2.0, Vectors.dense(1.0, 1.0))], + ["label", "features"]) + glr = GeneralizedLinearRegression(family="gaussian", link="identity", linkPredictionCol="p") + model = glr.fit(df) + self.assertEqual(model.getSolver(), "irls") + transformed1 = model.transform(df) + path = tempfile.mkdtemp() + model_path = path + "/glr" + model.save(model_path) + model2 = GeneralizedLinearRegressionModel.load(model_path) + self.assertEqual(model2.getSolver(), "irls") + transformed2 = model2.transform(df) + self.assertEqual(transformed1.take(4), transformed2.take(4)) + + class PersistenceTest(SparkSessionTestCase): def test_linear_regression(self):