spark-instrumented-optimizer/python/pyspark/mllib/tests/test_stat.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import array as pyarray
import unittest

from numpy import array

from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector, \
    DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT
from pyspark.mllib.random import RandomRDDs
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
from pyspark.sql.utils import IllegalArgumentException
from pyspark.testing.mllibutils import MLlibTestCase


class StatTests(MLlibTestCase):
    """Tests for the column summary returned by ``Statistics.colStats``."""

    # SPARK-4023
    def test_col_with_different_rdds(self):
        """``colStats`` reports the row count for RDDs built from different row types."""
        # rows generated by RandomRDDs.normalVectorRDD
        data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
        summary = Statistics.colStats(data)
        self.assertEqual(1000, summary.count())
        # rows given as builtin range objects
        data = self.sc.parallelize([range(10)] * 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, summary.count())
        # rows given as array.array of doubles
        data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, summary.count())

    def test_col_norms(self):
        """``normL1``/``normL2`` have one entry per column and match hand-computed values."""
        import math

        data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)
        summary = Statistics.colStats(data)
        self.assertEqual(10, len(summary.normL1()))
        self.assertEqual(10, len(summary.normL2()))

        # Single-column data 0..9: the L1 norm is 0 + 1 + ... + 9 = 45 and the
        # L2 norm is sqrt(0^2 + 1^2 + ... + 9^2).
        data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))
        summary2 = Statistics.colStats(data2)

        # Check length and element explicitly instead of passing two arrays to
        # assertEqual, which depends on the truth value of an elementwise
        # comparison and only works for single-element arrays.
        norm_l1 = summary2.normL1()
        self.assertEqual(1, len(norm_l1))
        self.assertEqual(45.0, norm_l1[0])

        # assertAlmostEqual reports both values on failure, unlike a bare assertTrue.
        expected_norm_l2 = math.sqrt(sum(x * x for x in range(10)))
        self.assertAlmostEqual(summary2.normL2()[0], expected_norm_l2, delta=1e-14)


class ChiSqTestTests(MLlibTestCase):
    """Exercises ``Statistics.chiSqTest`` on vectors, matrices and LabeledPoint RDDs."""

    def test_goodness_of_fit(self):
        """Goodness-of-fit on vector input: reference values and rejected inputs."""
        infinity = float("inf")

        uniform_obs = Vectors.dense([4, 6, 5])
        uniform_result = Statistics.chiSqTest(uniform_obs)

        # Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
        self.assertEqual(uniform_result.statistic, 0.4)
        self.assertEqual(uniform_result.degreesOfFreedom, 2)
        self.assertAlmostEqual(uniform_result.pValue, 0.8187, 4)

        # Expected and observed vectors whose sums differ
        scaled_obs = Vectors.dense([21, 38, 43, 80])
        scaled_exp = Vectors.dense([3, 5, 7, 20])
        scaled_result = Statistics.chiSqTest(scaled_obs, scaled_exp)

        # Results validated against the R command
        # `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
        self.assertAlmostEqual(scaled_result.statistic, 14.1429, 4)
        self.assertEqual(scaled_result.degreesOfFreedom, 3)
        self.assertAlmostEqual(scaled_result.pValue, 0.002717, 4)

        # Observed and expected vectors of different sizes
        short_obs = Vectors.dense([1.0, 2.0, 3.0])
        long_exp = Vectors.dense([1.0, 2.0, 3.0, 4.0])
        with self.assertRaises(ValueError):
            Statistics.chiSqTest(short_obs, long_exp)

        # A negative count in the observed vector
        negative_obs = Vectors.dense([1.0, 2.0, 3.0, -4.0])
        with self.assertRaises(IllegalArgumentException):
            Statistics.chiSqTest(negative_obs, scaled_exp)

        # Count = 0.0 in expected but not in observed
        exp_with_zero = Vectors.dense([1.0, 0.0, 3.0])
        infinite_result = Statistics.chiSqTest(uniform_obs, exp_with_zero)
        self.assertEqual(infinite_result.statistic, infinity)
        self.assertEqual(infinite_result.degreesOfFreedom, 2)
        self.assertEqual(infinite_result.pValue, 0.0)

        # 0.0 at the same position in both expected and observed
        obs_with_zero = Vectors.dense([2.0, 0.0, 1.0])
        with self.assertRaises(IllegalArgumentException):
            Statistics.chiSqTest(obs_with_zero, exp_with_zero)

    def test_matrix_independence(self):
        """Independence test on a dense matrix: reference values and rejected inputs."""
        counts = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]
        result = Statistics.chiSqTest(Matrices.dense(3, 4, counts))

        # Results validated against R command
        # `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
        self.assertAlmostEqual(result.statistic, 21.9958, 4)
        self.assertEqual(result.degreesOfFreedom, 6)
        self.assertAlmostEqual(result.pValue, 0.001213, 4)

        # Each 2x2 table below must be rejected with IllegalArgumentException.
        rejected_tables = [
            [4.0, 5.0, 3.0, -3.0],  # negative counts
            [0.0, 1.0, 0.0, 2.0],   # row sum = 0.0
            [0.0, 0.0, 2.0, 2.0],   # column sum = 0.0
        ]
        for values in rejected_tables:
            table = Matrices.dense(2, 2, values)
            with self.assertRaises(IllegalArgumentException):
                Statistics.chiSqTest(table)

    def test_chi_sq_pearson(self):
        """Per-feature results on a LabeledPoint RDD are the same for several partition counts."""
        rows = [
            (0.0, [0.5, 10.0]),
            (0.0, [1.5, 20.0]),
            (1.0, [1.5, 30.0]),
            (0.0, [3.5, 30.0]),
            (0.0, [3.5, 40.0]),
            (1.0, [3.5, 40.0]),
        ]
        points = [LabeledPoint(label, Vectors.dense(features)) for label, features in rows]

        # (statistic, degreesOfFreedom, pValue) expected for feature 0 and feature 1
        expected_per_feature = (
            (0.75, 2, 0.6873),
            (1.5, 3, 0.6823),
        )

        for num_parts in (2, 4, 6, 8):
            results = Statistics.chiSqTest(self.sc.parallelize(points, num_parts))
            for index, (statistic, dof, p_value) in enumerate(expected_per_feature):
                feature_result = results[index]
                self.assertEqual(feature_result.statistic, statistic)
                self.assertEqual(feature_result.degreesOfFreedom, dof)
                self.assertAlmostEqual(feature_result.pValue, p_value, 4)

    def test_right_number_of_results(self):
        """One result is returned per feature column, including the last column."""
        width = 1001
        sparse_points = [
            LabeledPoint(0.0, Vectors.sparse(width, [(100, 2.0)])),
            LabeledPoint(0.1, Vectors.sparse(width, [(200, 1.0)])),
        ]
        results = Statistics.chiSqTest(self.sc.parallelize(sparse_points))
        self.assertEqual(len(results), width)
        self.assertIsNotNone(results[1000])


class KolmogorovSmirnovTest(MLlibTestCase):
    """Checks ``Statistics.kolmogorovSmirnovTest`` against fixed reference values."""

    def test_R_implementation_equivalence(self):
        """The "norm" test gives the same statistic and p-value with and without (0, 1) params."""
        sample = [
            1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,
            -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,
            -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,
            -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,
            0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942
        ]
        data = self.sc.parallelize(sample)

        # First without distribution parameters, then with 0 and 1 passed explicitly.
        for dist_params in ((), (0, 1)):
            result = Statistics.kolmogorovSmirnovTest(data, "norm", *dist_params)
            self.assertAlmostEqual(result.statistic, 0.189, 3)
            self.assertAlmostEqual(result.pValue, 0.422, 3)


if __name__ == "__main__":
    # Pull in the test classes through the package-qualified module path.
    from pyspark.mllib.tests.test_stat import *

    # Fall back to unittest's default runner unless xmlrunner can be imported.
    testRunner = None
    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep testRunner as None.
        pass
    unittest.main(testRunner=testRunner, verbosity=2)
[SPARK-26034][PYTHON][TESTS] Break large mllib/tests.py file into smaller files ## What changes were proposed in this pull request? This PR breaks down the large mllib/tests.py file that contains all Python MLlib unit tests into several smaller test files to be easier to read and maintain. The tests are broken down as follows: ``` pyspark ├── __init__.py ... ├── mllib │ ├── __init__.py ... │ ├── tests │ │ ├── __init__.py │ │ ├── test_algorithms.py │ │ ├── test_feature.py │ │ ├── test_linalg.py │ │ ├── test_stat.py │ │ ├── test_streaming_algorithms.py │ │ └── test_util.py ... ├── testing ... │ ├── mllibutils.py ... ``` ## How was this patch tested? Ran tests manually by module to ensure test count was the same, and ran `python/run-tests --modules=pyspark-mllib` to verify all passing with Python 2.7 and Python 3.6. Also installed scipy to include optional tests in test_linalg. Closes #23056 from BryanCutler/python-test-breakup-mllib-SPARK-26034. Authored-by: Bryan Cutler <cutlerb@gmail.com> Signed-off-by: hyukjinkwon <gurwls223@apache.org> 2018-11-16 11:12:17 -05:00			`#`
			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

			`import array as pyarray`
[SPARK-26105][PYTHON] Clean unittest2 imports up that were added for Python 2.6 before ## What changes were proposed in this pull request? Currently, some PySpark tests still assume the tests could be run in Python 2.6 by importing `unittest2`. For instance: ```python if sys.version_info[:2] <= (2, 6): try: import unittest2 as unittest except ImportError: sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier') sys.exit(1) else: import unittest ``` While I am here, I removed some unused imports and reordered imports per PEP 8. We officially dropped Python 2.6 support a while ago and started to discuss dropping Python 2. It's better to remove them. ## How was this patch tested? Manual tests, and existing tests via Jenkins. Closes #23077 from HyukjinKwon/SPARK-26105. Lead-authored-by: hyukjinkwon <gurwls223@apache.org> Co-authored-by: Bryan Cutler <cutlerb@gmail.com> Signed-off-by: hyukjinkwon <gurwls223@apache.org> 2018-11-18 20:22:32 -05:00			`import unittest`
[SPARK-26034][PYTHON][TESTS] Break large mllib/tests.py file into smaller files ## What changes were proposed in this pull request? This PR breaks down the large mllib/tests.py file that contains all Python MLlib unit tests into several smaller test files to be easier to read and maintain. The tests are broken down as follows: ``` pyspark ├── __init__.py ... ├── mllib │ ├── __init__.py ... │ ├── tests │ │ ├── __init__.py │ │ ├── test_algorithms.py │ │ ├── test_feature.py │ │ ├── test_linalg.py │ │ ├── test_stat.py │ │ ├── test_streaming_algorithms.py │ │ └── test_util.py ... ├── testing ... │ ├── mllibutils.py ... ``` ## How was this patch tested? Ran tests manually by module to ensure test count was the same, and ran `python/run-tests --modules=pyspark-mllib` to verify all passing with Python 2.7 and Python 3.6. Also installed scipy to include optional tests in test_linalg. Closes #23056 from BryanCutler/python-test-breakup-mllib-SPARK-26034. Authored-by: Bryan Cutler <cutlerb@gmail.com> Signed-off-by: hyukjinkwon <gurwls223@apache.org> 2018-11-16 11:12:17 -05:00
			`from numpy import array`

			`from pyspark.mllib.linalg import Vector, SparseVector, DenseVector, VectorUDT, _convert_to_vector, \`
			`DenseMatrix, SparseMatrix, Vectors, Matrices, MatrixUDT`
			`from pyspark.mllib.random import RandomRDDs`
			`from pyspark.mllib.regression import LabeledPoint`
			`from pyspark.mllib.stat import Statistics`
			`from pyspark.sql.utils import IllegalArgumentException`
			`from pyspark.testing.mllibutils import MLlibTestCase`


			`class StatTests(MLlibTestCase):`
			`# SPARK-4023`
			`def test_col_with_different_rdds(self):`
			`# numpy`
			`data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)`
			`summary = Statistics.colStats(data)`
			`self.assertEqual(1000, summary.count())`
			`# array`
			`data = self.sc.parallelize([range(10)] * 10)`
			`summary = Statistics.colStats(data)`
			`self.assertEqual(10, summary.count())`
			`# array`
			`data = self.sc.parallelize([pyarray.array("d", range(10))] * 10)`
			`summary = Statistics.colStats(data)`
			`self.assertEqual(10, summary.count())`

			`def test_col_norms(self):`
			`data = RandomRDDs.normalVectorRDD(self.sc, 1000, 10, 10)`
			`summary = Statistics.colStats(data)`
			`self.assertEqual(10, len(summary.normL1()))`
			`self.assertEqual(10, len(summary.normL2()))`

			`data2 = self.sc.parallelize(range(10)).map(lambda x: Vectors.dense(x))`
			`summary2 = Statistics.colStats(data2)`
			`self.assertEqual(array([45.0]), summary2.normL1())`
			`import math`
			`expectedNormL2 = math.sqrt(sum(map(lambda x: x*x, range(10))))`
			`self.assertTrue(math.fabs(summary2.normL2()[0] - expectedNormL2) < 1e-14)`


			`class ChiSqTestTests(MLlibTestCase):`
			`def test_goodness_of_fit(self):`
			`from numpy import inf`

			`observed = Vectors.dense([4, 6, 5])`
			`pearson = Statistics.chiSqTest(observed)`

			# Validated against the R command `chisq.test(c(4, 6, 5), p=c(1/3, 1/3, 1/3))`
			`self.assertEqual(pearson.statistic, 0.4)`
			`self.assertEqual(pearson.degreesOfFreedom, 2)`
			`self.assertAlmostEqual(pearson.pValue, 0.8187, 4)`

			`# Different expected and observed sum`
			`observed1 = Vectors.dense([21, 38, 43, 80])`
			`expected1 = Vectors.dense([3, 5, 7, 20])`
			`pearson1 = Statistics.chiSqTest(observed1, expected1)`

			`# Results validated against the R command`
			# `chisq.test(c(21, 38, 43, 80), p=c(3/35, 1/7, 1/5, 4/7))`
			`self.assertAlmostEqual(pearson1.statistic, 14.1429, 4)`
			`self.assertEqual(pearson1.degreesOfFreedom, 3)`
			`self.assertAlmostEqual(pearson1.pValue, 0.002717, 4)`

			`# Vectors with different sizes`
			`observed3 = Vectors.dense([1.0, 2.0, 3.0])`
			`expected3 = Vectors.dense([1.0, 2.0, 3.0, 4.0])`
			`self.assertRaises(ValueError, Statistics.chiSqTest, observed3, expected3)`

			`# Negative counts in observed`
			`neg_obs = Vectors.dense([1.0, 2.0, 3.0, -4.0])`
			`self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, neg_obs, expected1)`

			`# Count = 0.0 in expected but not observed`
			`zero_expected = Vectors.dense([1.0, 0.0, 3.0])`
			`pearson_inf = Statistics.chiSqTest(observed, zero_expected)`
			`self.assertEqual(pearson_inf.statistic, inf)`
			`self.assertEqual(pearson_inf.degreesOfFreedom, 2)`
			`self.assertEqual(pearson_inf.pValue, 0.0)`

			`# 0.0 in expected and observed simultaneously`
			`zero_observed = Vectors.dense([2.0, 0.0, 1.0])`
			`self.assertRaises(`
			`IllegalArgumentException, Statistics.chiSqTest, zero_observed, zero_expected)`

			`def test_matrix_independence(self):`
			`data = [40.0, 24.0, 29.0, 56.0, 32.0, 42.0, 31.0, 10.0, 0.0, 30.0, 15.0, 12.0]`
			`chi = Statistics.chiSqTest(Matrices.dense(3, 4, data))`

			`# Results validated against R command`
			# `chisq.test(rbind(c(40, 56, 31, 30),c(24, 32, 10, 15), c(29, 42, 0, 12)))`
			`self.assertAlmostEqual(chi.statistic, 21.9958, 4)`
			`self.assertEqual(chi.degreesOfFreedom, 6)`
			`self.assertAlmostEqual(chi.pValue, 0.001213, 4)`

			`# Negative counts`
			`neg_counts = Matrices.dense(2, 2, [4.0, 5.0, 3.0, -3.0])`
			`self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, neg_counts)`

			`# Row sum = 0.0`
			`row_zero = Matrices.dense(2, 2, [0.0, 1.0, 0.0, 2.0])`
			`self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, row_zero)`

			`# Column sum = 0.0`
			`col_zero = Matrices.dense(2, 2, [0.0, 0.0, 2.0, 2.0])`
			`self.assertRaises(IllegalArgumentException, Statistics.chiSqTest, col_zero)`

			`def test_chi_sq_pearson(self):`
			`data = [`
			`LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),`
			`LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),`
			`LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),`
			`LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),`
			`LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),`
			`LabeledPoint(1.0, Vectors.dense([3.5, 40.0]))`
			`]`

			`for numParts in [2, 4, 6, 8]:`
			`chi = Statistics.chiSqTest(self.sc.parallelize(data, numParts))`
			`feature1 = chi[0]`
			`self.assertEqual(feature1.statistic, 0.75)`
			`self.assertEqual(feature1.degreesOfFreedom, 2)`
			`self.assertAlmostEqual(feature1.pValue, 0.6873, 4)`

			`feature2 = chi[1]`
			`self.assertEqual(feature2.statistic, 1.5)`
			`self.assertEqual(feature2.degreesOfFreedom, 3)`
			`self.assertAlmostEqual(feature2.pValue, 0.6823, 4)`

			`def test_right_number_of_results(self):`
			`num_cols = 1001`
			`sparse_data = [`
			`LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),`
			`LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)]))`
			`]`
			`chi = Statistics.chiSqTest(self.sc.parallelize(sparse_data))`
			`self.assertEqual(len(chi), num_cols)`
			`self.assertIsNotNone(chi[1000])`


			`class KolmogorovSmirnovTest(MLlibTestCase):`

			`def test_R_implementation_equivalence(self):`
			`data = self.sc.parallelize([`
			`1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,`
			`-0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,`
			`-0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,`
			`-0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,`
			`0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942`
			`])`
			`model = Statistics.kolmogorovSmirnovTest(data, "norm")`
			`self.assertAlmostEqual(model.statistic, 0.189, 3)`
			`self.assertAlmostEqual(model.pValue, 0.422, 3)`

			`model = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)`
			`self.assertAlmostEqual(model.statistic, 0.189, 3)`
			`self.assertAlmostEqual(model.pValue, 0.422, 3)`


			`if __name__ == "__main__":`
			`from pyspark.mllib.tests.test_stat import *`

			`try:`
			`import xmlrunner`
[SPARK-28130][PYTHON] Print pretty messages for skipped tests when xmlrunner is available in PySpark ## What changes were proposed in this pull request? Currently, the pretty skipped-message mechanism added by https://github.com/apache/spark/commit/f7435bec6a9348cfbbe26b13c230c08545d16067 apparently does not work when xmlrunner is installed. This PR fixes two things: 1. When `xmlrunner` is installed, it seems `xmlrunner` does not respect the `verbosity` level in unittest (default is level 1). So the output looks as below ``` Running tests... ---------------------------------------------------------------------- SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS ---------------------------------------------------------------------- ``` So it is not caught by our message detection mechanism. 2. If we manually set the `verbosity` level for `xmlrunner`, it prints messages as below: ``` test_mixed_udf (pyspark.sql.tests.test_pandas_udf_scalar.ScalarPandasUDFTests) ... SKIP (0.000s) test_mixed_udf_and_sql (pyspark.sql.tests.test_pandas_udf_scalar.ScalarPandasUDFTests) ... SKIP (0.000s) ... ``` This is different on our Jenkins machine: ``` test_createDataFrame_column_name_encoding (pyspark.sql.tests.test_arrow.ArrowTests) ... skipped 'Pandas >= 0.23.2 must be installed; however, it was not found.' test_createDataFrame_does_not_modify_input (pyspark.sql.tests.test_arrow.ArrowTests) ... skipped 'Pandas >= 0.23.2 must be installed; however, it was not found.' ... ``` Note that the last `SKIP` is different. This PR fixes the regular expression to catch the `SKIP` case as well. ## How was this patch tested? Manually tested. Before: ``` Starting test(python2.7): pyspark.... Finished test(python2.7): pyspark.... (0s) ... Tests passed in 562 seconds ======================================================================== ... ``` After: ``` Starting test(python2.7): pyspark.... Finished test(python2.7): pyspark.... (48s) ... 93 tests were skipped ... 
Tests passed in 560 seconds Skipped tests pyspark.... with python2.7: pyspark...(...) ... SKIP (0.000s) ... ======================================================================== ... ``` Closes #24927 from HyukjinKwon/SPARK-28130. Authored-by: HyukjinKwon <gurwls223@apache.org> Signed-off-by: HyukjinKwon <gurwls223@apache.org> 2019-06-23 20:58:17 -04:00			`testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)`
[SPARK-26034][PYTHON][TESTS] Break large mllib/tests.py file into smaller files ## What changes were proposed in this pull request? This PR breaks down the large mllib/tests.py file that contains all Python MLlib unit tests into several smaller test files to be easier to read and maintain. The tests are broken down as follows: ``` pyspark ├── __init__.py ... ├── mllib │ ├── __init__.py ... │ ├── tests │ │ ├── __init__.py │ │ ├── test_algorithms.py │ │ ├── test_feature.py │ │ ├── test_linalg.py │ │ ├── test_stat.py │ │ ├── test_streaming_algorithms.py │ │ └── test_util.py ... ├── testing ... │ ├── mllibutils.py ... ``` ## How was this patch tested? Ran tests manually by module to ensure test count was the same, and ran `python/run-tests --modules=pyspark-mllib` to verify all passing with Python 2.7 and Python 3.6. Also installed scipy to include optional tests in test_linalg. Closes #23056 from BryanCutler/python-test-breakup-mllib-SPARK-26034. Authored-by: Bryan Cutler <cutlerb@gmail.com> Signed-off-by: hyukjinkwon <gurwls223@apache.org> 2018-11-16 11:12:17 -05:00			`except ImportError:`
			`testRunner = None`
			`unittest.main(testRunner=testRunner, verbosity=2)`