#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import shutil
import tempfile
import unittest

from pyspark.mllib.common import _to_java_object_rdd
from pyspark.mllib.util import LinearDataGenerator
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import SparseVector, DenseVector, Vectors
from pyspark.mllib.random import RandomRDDs
from pyspark.testing.mllibutils import MLlibTestCase

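# Exercises the MLUtils helpers exposed to Python: appendBias should keep
# dense input dense and sparse input sparse, and loadVectors should
# round-trip vectors written with saveAsTextFile.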
class MLUtilsTests(MLlibTestCase):
    def test_append_bias(self):
        data = [2.0, 2.0, 2.0]
        ret = MLUtils.appendBias(data)
        self.assertEqual(ret[3], 1.0)
        self.assertEqual(type(ret), DenseVector)

    def test_append_bias_with_vector(self):
        data = Vectors.dense([2.0, 2.0, 2.0])
        ret = MLUtils.appendBias(data)
        self.assertEqual(ret[3], 1.0)
        self.assertEqual(type(ret), DenseVector)

    def test_append_bias_with_sp_vector(self):
        data = Vectors.sparse(3, {0: 2.0, 2: 2.0})
        expected = Vectors.sparse(4, {0: 2.0, 2: 2.0, 3: 1.0})
        # Returned value must be SparseVector
        ret = MLUtils.appendBias(data)
        self.assertEqual(ret, expected)
        self.assertEqual(type(ret), SparseVector)

    def test_load_vectors(self):
        data = [
            [1.0, 2.0, 3.0],
            [1.0, 2.0, 3.0]
        ]
        temp_dir = tempfile.mkdtemp()
        load_vectors_path = os.path.join(temp_dir, "test_load_vectors")
        try:
            self.sc.parallelize(data).saveAsTextFile(load_vectors_path)
            ret_rdd = MLUtils.loadVectors(self.sc, load_vectors_path)
            ret = ret_rdd.collect()
            self.assertEqual(len(ret), 2)
            self.assertEqual(ret[0], DenseVector([1.0, 2.0, 3.0]))
            self.assertEqual(ret[1], DenseVector([1.0, 2.0, 3.0]))
        except Exception:
            self.fail()
        finally:
            # Remove the whole temp directory, not just the subdirectory,
            # so the mkdtemp() root is not leaked.
            shutil.rmtree(temp_dir)

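# Checks the shape of the data produced by LinearDataGenerator: the local
# and RDD variants should yield nPoints/nexamples labeled points, each with
# the requested number of features.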
class LinearDataGeneratorTests(MLlibTestCase):
    def test_dim(self):
        linear_data = LinearDataGenerator.generateLinearInput(
            intercept=0.0, weights=[0.0, 0.0, 0.0],
            xMean=[0.0, 0.0, 0.0], xVariance=[0.33, 0.33, 0.33],
            nPoints=4, seed=0, eps=0.1)
        self.assertEqual(len(linear_data), 4)
        for point in linear_data:
            self.assertEqual(len(point.features), 3)

        linear_data = LinearDataGenerator.generateLinearRDD(
            sc=self.sc, nexamples=6, nfeatures=2, eps=0.1,
            nParts=2, intercept=0.0).collect()
        self.assertEqual(len(linear_data), 6)
        for point in linear_data:
            self.assertEqual(len(point.features), 2)

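# Regression test for SPARK-6660: round-tripping an RDD through
# _to_java_object_rdd must preserve its element count.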
class SerDeTest(MLlibTestCase):
    def test_to_java_object_rdd(self):  # SPARK-6660
        data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0)
        self.assertEqual(_to_java_object_rdd(data).count(), 10)

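# Use xmlrunner when available so CI gets JUnit-style XML reports; fall
# back to the default text runner otherwise.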
if __name__ == "__main__":
    from pyspark.mllib.tests.test_util import *

    try:
        import xmlrunner
        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports')
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)