# spark-instrumented-optimizer/python/pyspark/ml/tests/test_training_summary.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys
import unittest

# Python 3 has no ``basestring``; alias it to ``str`` so the string-type checks below
# work on both major versions. The numeric major version is compared instead of the
# version string, because a lexicographic comparison mis-orders a two-digit major
# version (e.g. "10.0" < "3").
if sys.version_info[0] >= 3:
    basestring = str

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.clustering import BisectingKMeans, GaussianMixture, KMeans
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import GeneralizedLinearRegression, LinearRegression
from pyspark.sql import DataFrame
from pyspark.testing.mlutils import SparkSessionTestCase


class TrainingSummaryTest(SparkSessionTestCase):
    """Checks that the summaries attached to fitted ML models are callable from Python
    and return values of the expected types.
    """

    def _assert_float_list(self, values):
        """Assert that ``values`` is a non-empty list whose first element is a float."""
        self.assertIsInstance(values, list)
        # An explicit emptiness check fails with a readable message instead of an IndexError.
        self.assertGreater(len(values), 0)
        self.assertIsInstance(values[0], float)

    def test_linear_regression_summary(self):
        """Fit a weighted, intercept-free LinearRegression and inspect its summary."""
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",
                              fitIntercept=False)
        model = lr.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        # test that api is callable and returns expected types
        self.assertGreater(s.totalIterations, 0)
        self.assertIsInstance(s.predictions, DataFrame)
        self.assertEqual(s.predictionCol, "prediction")
        self.assertEqual(s.labelCol, "label")
        self.assertEqual(s.featuresCol, "features")
        self._assert_float_list(s.objectiveHistory)
        self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
        self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
        self.assertAlmostEqual(s.meanSquaredError, 0.0)
        self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
        self.assertAlmostEqual(s.r2, 1.0, 2)
        self.assertAlmostEqual(s.r2adj, 1.0, 2)
        self.assertIsInstance(s.residuals, DataFrame)
        self.assertEqual(s.numInstances, 2)
        self.assertEqual(s.degreesOfFreedom, 1)
        self._assert_float_list(s.devianceResiduals)
        self._assert_float_list(s.coefficientStandardErrors)
        self._assert_float_list(s.tValues)
        self._assert_float_list(s.pValues)
        # test evaluation (with training dataset) produces a summary with same values
        # one check is enough to verify a summary is returned
        # The child class LinearRegressionTrainingSummary runs full test
        sameSummary = model.evaluate(df)
        self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)

    def test_glr_summary(self):
        """Fit a weighted gaussian/identity GeneralizedLinearRegression and inspect its summary."""
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",
                                          fitIntercept=False)
        model = glr.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        # test that api is callable and returns expected types
        self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
        self.assertIsInstance(s.predictions, DataFrame)
        self.assertEqual(s.predictionCol, "prediction")
        self.assertEqual(s.numInstances, 2)
        self.assertIsInstance(s.residuals(), DataFrame)
        self.assertIsInstance(s.residuals("pearson"), DataFrame)
        self._assert_float_list(s.coefficientStandardErrors)
        self._assert_float_list(s.tValues)
        self._assert_float_list(s.pValues)
        self.assertEqual(s.degreesOfFreedom, 1)
        self.assertEqual(s.residualDegreeOfFreedom, 1)
        self.assertEqual(s.residualDegreeOfFreedomNull, 2)
        self.assertEqual(s.rank, 1)
        self.assertIsInstance(s.solver, basestring)
        self.assertIsInstance(s.aic, float)
        self.assertIsInstance(s.deviance, float)
        self.assertIsInstance(s.nullDeviance, float)
        self.assertIsInstance(s.dispersion, float)
        # test evaluation (with training dataset) produces a summary with same values
        # one check is enough to verify a summary is returned
        # The child class GeneralizedLinearRegressionTrainingSummary runs full test
        sameSummary = model.evaluate(df)
        self.assertAlmostEqual(sameSummary.deviance, s.deviance)

    def test_binary_logistic_regression_summary(self):
        """Fit LogisticRegression on two labels and inspect the binary summary metrics."""
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
        model = lr.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        # test that api is callable and returns expected types
        self.assertIsInstance(s.predictions, DataFrame)
        self.assertEqual(s.probabilityCol, "probability")
        self.assertEqual(s.labelCol, "label")
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        self._assert_float_list(s.objectiveHistory)
        self.assertGreater(s.totalIterations, 0)
        self.assertIsInstance(s.labels, list)
        self.assertIsInstance(s.truePositiveRateByLabel, list)
        self.assertIsInstance(s.falsePositiveRateByLabel, list)
        self.assertIsInstance(s.precisionByLabel, list)
        self.assertIsInstance(s.recallByLabel, list)
        self.assertIsInstance(s.fMeasureByLabel(), list)
        self.assertIsInstance(s.fMeasureByLabel(1.0), list)
        self.assertIsInstance(s.roc, DataFrame)
        self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
        self.assertIsInstance(s.pr, DataFrame)
        self.assertIsInstance(s.fMeasureByThreshold, DataFrame)
        self.assertIsInstance(s.precisionByThreshold, DataFrame)
        self.assertIsInstance(s.recallByThreshold, DataFrame)
        self.assertAlmostEqual(s.accuracy, 1.0, 2)
        self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
        self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
        self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
        self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
        self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
        self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)
        # test evaluation (with training dataset) produces a summary with same values
        # one check is enough to verify a summary is returned, Scala version runs full test
        sameSummary = model.evaluate(df)
        self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)

    def test_multiclass_logistic_regression_summary(self):
        """Fit LogisticRegression on three labels and inspect the multiclass summary metrics."""
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], [])),
                                         (2.0, 2.0, Vectors.dense(2.0)),
                                         (2.0, 2.0, Vectors.dense(1.9))],
                                        ["label", "weight", "features"])
        lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
        model = lr.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        # test that api is callable and returns expected types
        self.assertIsInstance(s.predictions, DataFrame)
        self.assertEqual(s.probabilityCol, "probability")
        self.assertEqual(s.labelCol, "label")
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        self._assert_float_list(s.objectiveHistory)
        self.assertGreater(s.totalIterations, 0)
        self.assertIsInstance(s.labels, list)
        self.assertIsInstance(s.truePositiveRateByLabel, list)
        self.assertIsInstance(s.falsePositiveRateByLabel, list)
        self.assertIsInstance(s.precisionByLabel, list)
        self.assertIsInstance(s.recallByLabel, list)
        self.assertIsInstance(s.fMeasureByLabel(), list)
        self.assertIsInstance(s.fMeasureByLabel(1.0), list)
        self.assertAlmostEqual(s.accuracy, 0.75, 2)
        self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)
        self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)
        self.assertAlmostEqual(s.weightedRecall, 0.75, 2)
        self.assertAlmostEqual(s.weightedPrecision, 0.583, 2)
        self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2)
        self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2)
        # test evaluation (with training dataset) produces a summary with same values
        # one check is enough to verify a summary is returned, Scala version runs full test
        sameSummary = model.evaluate(df)
        self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)

    def test_gaussian_mixture_summary(self):
        """Fit a two-component GaussianMixture and inspect its clustering summary."""
        data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
                (Vectors.sparse(1, [], []),)]
        df = self.spark.createDataFrame(data, ["features"])
        gmm = GaussianMixture(k=2)
        model = gmm.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        self.assertIsInstance(s.predictions, DataFrame)
        self.assertEqual(s.probabilityCol, "probability")
        self.assertIsInstance(s.probability, DataFrame)
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        self.assertIsInstance(s.cluster, DataFrame)
        self.assertEqual(len(s.clusterSizes), 2)
        self.assertEqual(s.k, 2)
        self.assertEqual(s.numIter, 3)

    def test_bisecting_kmeans_summary(self):
        """Fit a two-cluster BisectingKMeans and inspect its clustering summary."""
        data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
                (Vectors.sparse(1, [], []),)]
        df = self.spark.createDataFrame(data, ["features"])
        bkm = BisectingKMeans(k=2)
        model = bkm.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        self.assertIsInstance(s.predictions, DataFrame)
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        self.assertIsInstance(s.cluster, DataFrame)
        self.assertEqual(len(s.clusterSizes), 2)
        self.assertEqual(s.k, 2)
        self.assertEqual(s.numIter, 20)

    def test_kmeans_summary(self):
        """Fit a seeded two-cluster KMeans and inspect its clustering summary."""
        data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
                (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
        df = self.spark.createDataFrame(data, ["features"])
        kmeans = KMeans(k=2, seed=1)
        model = kmeans.fit(df)
        self.assertTrue(model.hasSummary)
        s = model.summary
        self.assertIsInstance(s.predictions, DataFrame)
        self.assertEqual(s.featuresCol, "features")
        self.assertEqual(s.predictionCol, "prediction")
        self.assertIsInstance(s.cluster, DataFrame)
        self.assertEqual(len(s.clusterSizes), 2)
        self.assertEqual(s.k, 2)
        self.assertEqual(s.numIter, 1)


if __name__ == "__main__":
    # Script entry point: pull in everything exported by the packaged copy of this module,
    # then hand control to unittest.
    from pyspark.ml.tests.test_training_summary import *

    # Start from unittest's default runner (None) and switch to the XML-reporting runner
    # only when the optional xmlrunner package can be imported.
    testRunner = None
    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=testRunner, verbosity=2)
[SPARK-26033][PYTHON][TESTS] Break large ml/tests.py file into smaller files ## What changes were proposed in this pull request? This PR breaks down the large ml/tests.py file that contains all Python ML unit tests into several smaller test files to be easier to read and maintain. The tests are broken down as follows: ``` pyspark ├── __init__.py ... ├── ml │ ├── __init__.py ... │ ├── tests │ │ ├── __init__.py │ │ ├── test_algorithms.py │ │ ├── test_base.py │ │ ├── test_evaluation.py │ │ ├── test_feature.py │ │ ├── test_image.py │ │ ├── test_linalg.py │ │ ├── test_param.py │ │ ├── test_persistence.py │ │ ├── test_pipeline.py │ │ ├── test_stat.py │ │ ├── test_training_summary.py │ │ ├── test_tuning.py │ │ └── test_wrapper.py ... ├── testing ... │ ├── mlutils.py ... ``` ## How was this patch tested? Ran tests manually by module to ensure test count was the same, and ran `python/run-tests --modules=pyspark-ml` to verify all passing with Python 2.7 and Python 3.6. Closes #23063 from BryanCutler/python-test-breakup-ml-SPARK-26033. Authored-by: Bryan Cutler <cutlerb@gmail.com> Signed-off-by: hyukjinkwon <gurwls223@apache.org> 2018-11-18 03:02:15 -05:00			`#`
			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`import sys`
[SPARK-26105][PYTHON] Clean unittest2 imports up that were added for Python 2.6 before ## What changes were proposed in this pull request? Currently, some PySpark tests still assume they could be run in Python 2.6 by importing `unittest2`. For instance: ```python if sys.version_info[:2] <= (2, 6): try: import unittest2 as unittest except ImportError: sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier') sys.exit(1) else: import unittest ``` While I am here, I removed some unused imports and reordered imports per PEP 8. We officially dropped Python 2.6 support a while ago and started to discuss dropping Python 2. It's better to remove them. ## How was this patch tested? Manual tests, and existing tests via Jenkins. Closes #23077 from HyukjinKwon/SPARK-26105. Lead-authored-by: hyukjinkwon <gurwls223@apache.org> Co-authored-by: Bryan Cutler <cutlerb@gmail.com> Signed-off-by: hyukjinkwon <gurwls223@apache.org> 2018-11-18 20:22:32 -05:00			`import unittest`
[SPARK-26033][PYTHON][TESTS] Break large ml/tests.py file into smaller files ## What changes were proposed in this pull request? This PR breaks down the large ml/tests.py file that contains all Python ML unit tests into several smaller test files to be easier to read and maintain. The tests are broken down as follows: ``` pyspark ├── __init__.py ... ├── ml │ ├── __init__.py ... │ ├── tests │ │ ├── __init__.py │ │ ├── test_algorithms.py │ │ ├── test_base.py │ │ ├── test_evaluation.py │ │ ├── test_feature.py │ │ ├── test_image.py │ │ ├── test_linalg.py │ │ ├── test_param.py │ │ ├── test_persistence.py │ │ ├── test_pipeline.py │ │ ├── test_stat.py │ │ ├── test_training_summary.py │ │ ├── test_tuning.py │ │ └── test_wrapper.py ... ├── testing ... │ ├── mlutils.py ... ``` ## How was this patch tested? Ran tests manually by module to ensure test count was the same, and ran `python/run-tests --modules=pyspark-ml` to verify all passing with Python 2.7 and Python 3.6. Closes #23063 from BryanCutler/python-test-breakup-ml-SPARK-26033. Authored-by: Bryan Cutler <cutlerb@gmail.com> Signed-off-by: hyukjinkwon <gurwls223@apache.org> 2018-11-18 03:02:15 -05:00
			`if sys.version > '3':`
			`basestring = str`

			`from pyspark.ml.classification import LogisticRegression`
			`from pyspark.ml.clustering import BisectingKMeans, GaussianMixture, KMeans`
			`from pyspark.ml.linalg import Vectors`
			`from pyspark.ml.regression import GeneralizedLinearRegression, LinearRegression`
			`from pyspark.sql import DataFrame`
			`from pyspark.testing.mlutils import SparkSessionTestCase`


			`class TrainingSummaryTest(SparkSessionTestCase):`

			`def test_linear_regression_summary(self):`
			`df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),`
			`(0.0, 2.0, Vectors.sparse(1, [], []))],`
			`["label", "weight", "features"])`
			`lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",`
			`fitIntercept=False)`
			`model = lr.fit(df)`
			`self.assertTrue(model.hasSummary)`
			`s = model.summary`
			`# test that api is callable and returns expected types`
			`self.assertGreater(s.totalIterations, 0)`
			`self.assertTrue(isinstance(s.predictions, DataFrame))`
			`self.assertEqual(s.predictionCol, "prediction")`
			`self.assertEqual(s.labelCol, "label")`
			`self.assertEqual(s.featuresCol, "features")`
			`objHist = s.objectiveHistory`
			`self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))`
			`self.assertAlmostEqual(s.explainedVariance, 0.25, 2)`
			`self.assertAlmostEqual(s.meanAbsoluteError, 0.0)`
			`self.assertAlmostEqual(s.meanSquaredError, 0.0)`
			`self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)`
			`self.assertAlmostEqual(s.r2, 1.0, 2)`
			`self.assertAlmostEqual(s.r2adj, 1.0, 2)`
			`self.assertTrue(isinstance(s.residuals, DataFrame))`
			`self.assertEqual(s.numInstances, 2)`
			`self.assertEqual(s.degreesOfFreedom, 1)`
			`devResiduals = s.devianceResiduals`
			`self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float))`
			`coefStdErr = s.coefficientStandardErrors`
			`self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))`
			`tValues = s.tValues`
			`self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))`
			`pValues = s.pValues`
			`self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))`
			`# test evaluation (with training dataset) produces a summary with same values`
			`# one check is enough to verify a summary is returned`
			`# The child class LinearRegressionTrainingSummary runs full test`
			`sameSummary = model.evaluate(df)`
			`self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)`

			`def test_glr_summary(self):`
			`from pyspark.ml.linalg import Vectors`
			`df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),`
			`(0.0, 2.0, Vectors.sparse(1, [], []))],`
			`["label", "weight", "features"])`
			`glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",`
			`fitIntercept=False)`
			`model = glr.fit(df)`
			`self.assertTrue(model.hasSummary)`
			`s = model.summary`
			`# test that api is callable and returns expected types`
			`self.assertEqual(s.numIterations, 1) # this should default to a single iteration of WLS`
			`self.assertTrue(isinstance(s.predictions, DataFrame))`
			`self.assertEqual(s.predictionCol, "prediction")`
			`self.assertEqual(s.numInstances, 2)`
			`self.assertTrue(isinstance(s.residuals(), DataFrame))`
			`self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))`
			`coefStdErr = s.coefficientStandardErrors`
			`self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))`
			`tValues = s.tValues`
			`self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))`
			`pValues = s.pValues`
			`self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))`
			`self.assertEqual(s.degreesOfFreedom, 1)`
			`self.assertEqual(s.residualDegreeOfFreedom, 1)`
			`self.assertEqual(s.residualDegreeOfFreedomNull, 2)`
			`self.assertEqual(s.rank, 1)`
			`self.assertTrue(isinstance(s.solver, basestring))`
			`self.assertTrue(isinstance(s.aic, float))`
			`self.assertTrue(isinstance(s.deviance, float))`
			`self.assertTrue(isinstance(s.nullDeviance, float))`
			`self.assertTrue(isinstance(s.dispersion, float))`
			`# test evaluation (with training dataset) produces a summary with same values`
			`# one check is enough to verify a summary is returned`
			`# The child class GeneralizedLinearRegressionTrainingSummary runs full test`
			`sameSummary = model.evaluate(df)`
			`self.assertAlmostEqual(sameSummary.deviance, s.deviance)`

			`def test_binary_logistic_regression_summary(self):`
			`df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),`
			`(0.0, 2.0, Vectors.sparse(1, [], []))],`
			`["label", "weight", "features"])`
			`lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)`
			`model = lr.fit(df)`
			`self.assertTrue(model.hasSummary)`
			`s = model.summary`
			`# test that api is callable and returns expected types`
			`self.assertTrue(isinstance(s.predictions, DataFrame))`
			`self.assertEqual(s.probabilityCol, "probability")`
			`self.assertEqual(s.labelCol, "label")`
			`self.assertEqual(s.featuresCol, "features")`
			`self.assertEqual(s.predictionCol, "prediction")`
			`objHist = s.objectiveHistory`
			`self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))`
			`self.assertGreater(s.totalIterations, 0)`
			`self.assertTrue(isinstance(s.labels, list))`
			`self.assertTrue(isinstance(s.truePositiveRateByLabel, list))`
			`self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))`
			`self.assertTrue(isinstance(s.precisionByLabel, list))`
			`self.assertTrue(isinstance(s.recallByLabel, list))`
			`self.assertTrue(isinstance(s.fMeasureByLabel(), list))`
			`self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))`
			`self.assertTrue(isinstance(s.roc, DataFrame))`
			`self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)`
			`self.assertTrue(isinstance(s.pr, DataFrame))`
			`self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))`
			`self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))`
			`self.assertTrue(isinstance(s.recallByThreshold, DataFrame))`
			`self.assertAlmostEqual(s.accuracy, 1.0, 2)`
			`self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)`
			`self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)`
			`self.assertAlmostEqual(s.weightedRecall, 1.0, 2)`
			`self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)`
			`self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)`
			`self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)`
			`# test evaluation (with training dataset) produces a summary with same values`
			`# one check is enough to verify a summary is returned, Scala version runs full test`
			`sameSummary = model.evaluate(df)`
			`self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)`

			`def test_multiclass_logistic_regression_summary(self):`
			`df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),`
			`(0.0, 2.0, Vectors.sparse(1, [], [])),`
			`(2.0, 2.0, Vectors.dense(2.0)),`
			`(2.0, 2.0, Vectors.dense(1.9))],`
			`["label", "weight", "features"])`
			`lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)`
			`model = lr.fit(df)`
			`self.assertTrue(model.hasSummary)`
			`s = model.summary`
			`# test that api is callable and returns expected types`
			`self.assertTrue(isinstance(s.predictions, DataFrame))`
			`self.assertEqual(s.probabilityCol, "probability")`
			`self.assertEqual(s.labelCol, "label")`
			`self.assertEqual(s.featuresCol, "features")`
			`self.assertEqual(s.predictionCol, "prediction")`
			`objHist = s.objectiveHistory`
			`self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))`
			`self.assertGreater(s.totalIterations, 0)`
			`self.assertTrue(isinstance(s.labels, list))`
			`self.assertTrue(isinstance(s.truePositiveRateByLabel, list))`
			`self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))`
			`self.assertTrue(isinstance(s.precisionByLabel, list))`
			`self.assertTrue(isinstance(s.recallByLabel, list))`
			`self.assertTrue(isinstance(s.fMeasureByLabel(), list))`
			`self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))`
			`self.assertAlmostEqual(s.accuracy, 0.75, 2)`
			`self.assertAlmostEqual(s.weightedTruePositiveRate, 0.75, 2)`
			`self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.25, 2)`
			`self.assertAlmostEqual(s.weightedRecall, 0.75, 2)`
			`self.assertAlmostEqual(s.weightedPrecision, 0.583, 2)`
			`self.assertAlmostEqual(s.weightedFMeasure(), 0.65, 2)`
			`self.assertAlmostEqual(s.weightedFMeasure(1.0), 0.65, 2)`
			`# test evaluation (with training dataset) produces a summary with same values`
			`# one check is enough to verify a summary is returned, Scala version runs full test`
			`sameSummary = model.evaluate(df)`
			`self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)`

			`def test_gaussian_mixture_summary(self):`
			`data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),`
			`(Vectors.sparse(1, [], []),)]`
			`df = self.spark.createDataFrame(data, ["features"])`
			`gmm = GaussianMixture(k=2)`
			`model = gmm.fit(df)`
			`self.assertTrue(model.hasSummary)`
			`s = model.summary`
			`self.assertTrue(isinstance(s.predictions, DataFrame))`
			`self.assertEqual(s.probabilityCol, "probability")`
			`self.assertTrue(isinstance(s.probability, DataFrame))`
			`self.assertEqual(s.featuresCol, "features")`
			`self.assertEqual(s.predictionCol, "prediction")`
			`self.assertTrue(isinstance(s.cluster, DataFrame))`
			`self.assertEqual(len(s.clusterSizes), 2)`
			`self.assertEqual(s.k, 2)`
			`self.assertEqual(s.numIter, 3)`

			`def test_bisecting_kmeans_summary(self):`
			`data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),`
			`(Vectors.sparse(1, [], []),)]`
			`df = self.spark.createDataFrame(data, ["features"])`
			`bkm = BisectingKMeans(k=2)`
			`model = bkm.fit(df)`
			`self.assertTrue(model.hasSummary)`
			`s = model.summary`
			`self.assertTrue(isinstance(s.predictions, DataFrame))`
			`self.assertEqual(s.featuresCol, "features")`
			`self.assertEqual(s.predictionCol, "prediction")`
			`self.assertTrue(isinstance(s.cluster, DataFrame))`
			`self.assertEqual(len(s.clusterSizes), 2)`
			`self.assertEqual(s.k, 2)`
			`self.assertEqual(s.numIter, 20)`

			`def test_kmeans_summary(self):`
			`data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),`
			`(Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]`
			`df = self.spark.createDataFrame(data, ["features"])`
			`kmeans = KMeans(k=2, seed=1)`
			`model = kmeans.fit(df)`
			`self.assertTrue(model.hasSummary)`
			`s = model.summary`
			`self.assertTrue(isinstance(s.predictions, DataFrame))`
			`self.assertEqual(s.featuresCol, "features")`
			`self.assertEqual(s.predictionCol, "prediction")`
			`self.assertTrue(isinstance(s.cluster, DataFrame))`
			`self.assertEqual(len(s.clusterSizes), 2)`
			`self.assertEqual(s.k, 2)`
			`self.assertEqual(s.numIter, 1)`


			`if __name__ == "__main__":`
			`from pyspark.ml.tests.test_training_summary import *`

			`try:`
			`import xmlrunner`
[SPARK-28130][PYTHON] Print pretty messages for skipped tests when xmlrunner is available in PySpark ## What changes were proposed in this pull request? Currently, the pretty skipped-message mechanism added by https://github.com/apache/spark/commit/f7435bec6a9348cfbbe26b13c230c08545d16067 apparently does not work when xmlrunner is installed. This PR fixes two things: 1. When `xmlrunner` is installed, `xmlrunner` does not seem to respect the `verbosity` level in unittests (default is level 1). So the output looks as below ``` Running tests... ---------------------------------------------------------------------- SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS ---------------------------------------------------------------------- ``` So it is not caught by our message detection mechanism. 2. If we manually set the `verbosity` level to `xmlrunner`, it prints messages as below: ``` test_mixed_udf (pyspark.sql.tests.test_pandas_udf_scalar.ScalarPandasUDFTests) ... SKIP (0.000s) test_mixed_udf_and_sql (pyspark.sql.tests.test_pandas_udf_scalar.ScalarPandasUDFTests) ... SKIP (0.000s) ... ``` This is different in our Jenkins machine: ``` test_createDataFrame_column_name_encoding (pyspark.sql.tests.test_arrow.ArrowTests) ... skipped 'Pandas >= 0.23.2 must be installed; however, it was not found.' test_createDataFrame_does_not_modify_input (pyspark.sql.tests.test_arrow.ArrowTests) ... skipped 'Pandas >= 0.23.2 must be installed; however, it was not found.' ... ``` Note that last `SKIP` is different. This PR fixes the regular expression to catch `SKIP` case as well. ## How was this patch tested? Manually tested. Before: ``` Starting test(python2.7): pyspark.... Finished test(python2.7): pyspark.... (0s) ... Tests passed in 562 seconds ======================================================================== ... ``` After: ``` Starting test(python2.7): pyspark.... Finished test(python2.7): pyspark.... (48s) ... 93 tests were skipped ... 
Tests passed in 560 seconds Skipped tests pyspark.... with python2.7: pyspark...(...) ... SKIP (0.000s) ... ======================================================================== ... ``` Closes #24927 from HyukjinKwon/SPARK-28130. Authored-by: HyukjinKwon <gurwls223@apache.org> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2019-06-23 20:58:17 -04:00			`testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)`
[SPARK-26033][PYTHON][TESTS] Break large ml/tests.py file into smaller files ## What changes were proposed in this pull request? This PR breaks down the large ml/tests.py file that contains all Python ML unit tests into several smaller test files to be easier to read and maintain. The tests are broken down as follows: ``` pyspark ├── __init__.py ... ├── ml │ ├── __init__.py ... │ ├── tests │ │ ├── __init__.py │ │ ├── test_algorithms.py │ │ ├── test_base.py │ │ ├── test_evaluation.py │ │ ├── test_feature.py │ │ ├── test_image.py │ │ ├── test_linalg.py │ │ ├── test_param.py │ │ ├── test_persistence.py │ │ ├── test_pipeline.py │ │ ├── test_stat.py │ │ ├── test_training_summary.py │ │ ├── test_tuning.py │ │ └── test_wrapper.py ... ├── testing ... │ ├── mlutils.py ... ``` ## How was this patch tested? Ran tests manually by module to ensure test count was the same, and ran `python/run-tests --modules=pyspark-ml` to verify all passing with Python 2.7 and Python 3.6. Closes #23063 from BryanCutler/python-test-breakup-ml-SPARK-26033. Authored-by: Bryan Cutler <cutlerb@gmail.com> Signed-off-by: hyukjinkwon <gurwls223@apache.org> 2018-11-18 03:02:15 -05:00			`except ImportError:`
			`testRunner = None`
			`unittest.main(testRunner=testRunner, verbosity=2)`