spark-instrumented-optimizer/python/pyspark/tests.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Unit tests for PySpark; additional tests are implemented as doctests in
individual modules.
"""
from fileinput import input
from glob import glob
import os
import shutil
import sys
from tempfile import NamedTemporaryFile
import time
import unittest

from pyspark.context import SparkContext
from pyspark.files import SparkFiles
from pyspark.java_gateway import SPARK_HOME
from pyspark.serializers import read_int


class PySparkTestCase(unittest.TestCase):
    """Base fixture that gives every test its own local SparkContext.

    ``setUp`` snapshots ``sys.path`` and starts a context named after the
    concrete test class; ``tearDown`` stops that context and puts the
    snapshot back.
    """

    def setUp(self):
        # Keep a copy of sys.path so tearDown can undo whatever a test adds.
        self._old_sys_path = sys.path[:]
        self.sc = SparkContext('local[4]', type(self).__name__, batchSize=2)

    def tearDown(self):
        context = self.sc
        context.stop()
        sys.path = self._old_sys_path
        # Akka does not unbind its port immediately on shutdown, so clear the
        # recorded driver port to avoid rebinding to the same one.
        context._jvm.System.clearProperty("spark.driver.port")


class TestCheckpoint(PySparkTestCase):
    """Tests for RDD checkpointing through the Python API."""

    def setUp(self):
        PySparkTestCase.setUp(self)
        # NamedTemporaryFile is used only to reserve a unique path; the file
        # itself is removed so the path can be handed to setCheckpointDir.
        self.checkpointDir = NamedTemporaryFile(delete=False)
        # Close the handle before unlinking so the descriptor is not leaked
        # (unlinking a still-open file also fails on Windows).
        self.checkpointDir.close()
        os.unlink(self.checkpointDir.name)
        self.sc.setCheckpointDir(self.checkpointDir.name)

    def tearDown(self):
        # Remove the checkpoint directory even if stopping the context fails.
        try:
            PySparkTestCase.tearDown(self)
        finally:
            shutil.rmtree(self.checkpointDir.name)

    def test_basic_checkpointing(self):
        parCollection = self.sc.parallelize([1, 2, 3, 4])
        flatMappedRDD = parCollection.flatMap(lambda x: range(1, x + 1))

        # Nothing is checkpointed until checkpoint() is called and a job runs.
        # `assertTrue(... is None)` is kept because assertIsNone does not
        # exist in Python 2.6's unittest.
        self.assertFalse(flatMappedRDD.isCheckpointed())
        self.assertTrue(flatMappedRDD.getCheckpointFile() is None)

        flatMappedRDD.checkpoint()
        result = flatMappedRDD.collect()
        # 1 second; NOTE(review): presumably gives the checkpoint time to be
        # written before it is inspected -- confirm.
        time.sleep(1)
        self.assertTrue(flatMappedRDD.isCheckpointed())
        self.assertEqual(flatMappedRDD.collect(), result)
        # The checkpoint file is expected two levels below the configured dir.
        self.assertEqual("file:" + self.checkpointDir.name,
                         os.path.dirname(os.path.dirname(flatMappedRDD.getCheckpointFile())))

    def test_checkpoint_and_restore(self):
        parCollection = self.sc.parallelize([1, 2, 3, 4])
        flatMappedRDD = parCollection.flatMap(lambda x: [x])

        self.assertFalse(flatMappedRDD.isCheckpointed())
        self.assertTrue(flatMappedRDD.getCheckpointFile() is None)

        flatMappedRDD.checkpoint()
        flatMappedRDD.count()  # forces a checkpoint to be computed
        time.sleep(1)  # 1 second

        self.assertTrue(flatMappedRDD.getCheckpointFile() is not None)
        # Rebuild an RDD from the checkpoint file, reusing the original RDD's
        # deserializer, and check that it yields the same elements.
        recovered = self.sc._checkpointFile(flatMappedRDD.getCheckpointFile(),
                                            flatMappedRDD._jrdd_deserializer)
        self.assertEqual([1, 2, 3, 4], recovered.collect())


class TestAddFile(PySparkTestCase):
    """Tests for SparkContext.addFile and SparkContext.addPyFile."""

    def test_add_py_file(self):
        # To ensure that we're actually testing addPyFile's effects, check that
        # this job fails due to `userlibrary` not being on the Python path:
        def func(x):
            from userlibrary import UserClass
            return UserClass().hello()
        self.assertRaises(Exception,
                          self.sc.parallelize(range(2)).map(func).first)
        # Add the file, so the job should now succeed:
        path = os.path.join(SPARK_HOME, "python/test_support/userlibrary.py")
        self.sc.addPyFile(path)
        res = self.sc.parallelize(range(2)).map(func).first()
        self.assertEqual("Hello World!", res)

    def test_add_file_locally(self):
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        self.sc.addFile(path)
        # SparkFiles.get must resolve to a different path than the source file.
        download_path = SparkFiles.get("hello.txt")
        self.assertNotEqual(path, download_path)
        with open(download_path) as test_file:
            self.assertEqual("Hello World!\n", test_file.readline())

    def test_add_py_file_locally(self):
        # To ensure that we're actually testing addFile's effects, check that
        # this fails due to `userlibrary` not being on the Python path:
        def func():
            from userlibrary import UserClass
        self.assertRaises(ImportError, func)
        path = os.path.join(SPARK_HOME, "python/test_support/userlibrary.py")
        self.sc.addFile(path)
        # The same import must now succeed in this (driver) process.
        from userlibrary import UserClass
        self.assertEqual("Hello World!", UserClass().hello())

    def test_add_egg_file_locally(self):
        # To ensure that we're actually testing addPyFile's effects, check that
        # this fails due to `userlib` not being on the Python path:
        def func():
            from userlib import UserClass
        self.assertRaises(ImportError, func)
        path = os.path.join(SPARK_HOME, "python/test_support/userlib-0.1-py2.7.egg")
        self.sc.addPyFile(path)
        # The package inside the egg must now be importable in this process.
        from userlib import UserClass
        self.assertEqual("Hello World from inside a package!", UserClass().hello())


class TestRDDFunctions(PySparkTestCase):
    """Regression tests for individual RDD operations."""

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        # Only the unique name is needed: closing the file deletes it, and the
        # part files are then written under a directory at that path.
        tempFile = NamedTemporaryFile(delete=True)
        tempFile.close()
        try:
            data.saveAsTextFile(tempFile.name)
            # Read the part files as bytes and decode explicitly, so the check
            # does not depend on the interpreter's default text encoding.
            chunks = []
            for part_path in sorted(glob(tempFile.name + "/part-0000*")):
                with open(part_path, "rb") as part_file:
                    chunks.append(part_file.read())
            raw_contents = b"".join(chunks)
            self.assertEqual(x, raw_contents.strip().decode("utf-8"))
        finally:
            # Do not leave the output directory behind; ignore_errors covers
            # the case where the save failed before creating it.
            shutil.rmtree(tempFile.name, ignore_errors=True)

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        # Index the pair instead of using a tuple parameter, which is a
        # syntax error on Python 3.
        result = cart.map(lambda pair: pair[0] + pair[1]).collect()
        # Sums of (1,3), (1,4), (2,3), (2,4); sorted because the order of
        # collected elements is not what this test is about.
        self.assertEqual([4, 5, 5, 6], sorted(result))

    def test_cartesian_on_textfile(self):
        # Regression test for SPARK-978
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())


class TestIO(PySparkTestCase):
    """Tests involving output produced while a job runs."""

    def test_stdout_redirection(self):
        """Run a job whose per-element function shells out to ``ls``."""
        import subprocess

        def run_ls(_):
            # check_call raises if the command exits with a non-zero status.
            subprocess.check_call('ls', shell=True)

        single_element_rdd = self.sc.parallelize([1])
        single_element_rdd.foreach(run_ls)


class TestDaemon(unittest.TestCase):
    """Tests that a ``daemon.py`` process stops accepting connections once
    it has been asked to shut down."""

    def connect(self, port):
        """Connect to ``127.0.0.1:port`` and send the shutdown split index.

        Returns True on success; any socket error from connecting or sending
        propagates to the caller. The socket is closed in every case.
        """
        from socket import socket, AF_INET, SOCK_STREAM
        sock = socket(AF_INET, SOCK_STREAM)
        try:
            sock.connect(('127.0.0.1', port))
            # send a split index of -1 to shutdown the worker
            # (bytes literal + sendall: the whole 4-byte value is written,
            # and the call is also valid on Python 3)
            sock.sendall(b"\xFF\xFF\xFF\xFF")
        finally:
            sock.close()
        return True

    def do_termination_test(self, terminator):
        """Start the daemon, shut it down via ``terminator(daemon)`` and
        check that new connections are then refused.

        ``terminator`` receives the ``Popen`` object of the daemon process.
        """
        from subprocess import Popen, PIPE
        from errno import ECONNREFUSED

        # start daemon
        daemon_path = os.path.join(os.path.dirname(__file__), "daemon.py")
        daemon = Popen([sys.executable, daemon_path], stdin=PIPE, stdout=PIPE)
        try:
            # read the port number
            port = read_int(daemon.stdout)

            # daemon should accept connections
            self.assertTrue(self.connect(port))

            # request shutdown
            terminator(daemon)
            time.sleep(1)

            # daemon should no longer accept connections
            try:
                self.connect(port)
            except EnvironmentError as exception:
                self.assertEqual(exception.errno, ECONNREFUSED)
            else:
                self.fail("Expected EnvironmentError to be raised")
        finally:
            # Never leave the child or its pipes behind, whether the test
            # passed or failed: kill it if it is still running, reap it, and
            # close both pipe ends.
            if daemon.poll() is None:
                daemon.kill()
            daemon.wait()
            daemon.stdin.close()
            daemon.stdout.close()

    def test_termination_stdin(self):
        """Ensure that daemon and workers terminate when stdin is closed."""
        self.do_termination_test(lambda daemon: daemon.stdin.close())

    def test_termination_sigterm(self):
        """Ensure that daemon and workers terminate on SIGTERM."""
        from signal import SIGTERM
        self.do_termination_test(lambda daemon: os.kill(daemon.pid, SIGTERM))

# When this file is executed as a script, let unittest discover and run the
# test cases defined in this module.
if __name__ == "__main__":
    unittest.main()
Add Apache license headers and LICENSE and NOTICE files 2013-07-16 20:21:33 -04:00			`#`
			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

Add RDD checkpointing to Python API. 2013-01-16 22:15:14 -05:00			`"""`
			`Unit tests for PySpark; additional tests are implemented as doctests in`
			`individual modules.`
			`"""`
Fix UnicodeEncodeError in PySpark saveAsTextFile(). Fixes SPARK-970. 2013-11-29 02:44:56 -05:00			`from fileinput import input`
			`from glob import glob`
Add RDD checkpointing to Python API. 2013-01-16 22:15:14 -05:00			`import os`
			`import shutil`
Allow PySpark's SparkFiles to be used from driver Fix minor documentation formatting issues. 2013-01-23 13:36:18 -05:00			`import sys`
Add RDD checkpointing to Python API. 2013-01-16 22:15:14 -05:00			`from tempfile import NamedTemporaryFile`
			`import time`
			`import unittest`

			`from pyspark.context import SparkContext`
Allow PySpark's SparkFiles to be used from driver Fix minor documentation formatting issues. 2013-01-23 13:36:18 -05:00			`from pyspark.files import SparkFiles`
Fix sys.path bug in PySpark SparkContext.addPyFile 2013-01-22 20:54:11 -05:00			`from pyspark.java_gateway import SPARK_HOME`
Add tests and fixes for Python daemon shutdown 2013-05-10 18:48:48 -04:00			`from pyspark.serializers import read_int`
Add RDD checkpointing to Python API. 2013-01-16 22:15:14 -05:00

Fix sys.path bug in PySpark SparkContext.addPyFile 2013-01-22 20:54:11 -05:00			`class PySparkTestCase(unittest.TestCase):`
Add RDD checkpointing to Python API. 2013-01-16 22:15:14 -05:00
			`def setUp(self):`
Allow PySpark's SparkFiles to be used from driver Fix minor documentation formatting issues. 2013-01-23 13:36:18 -05:00			`self._old_sys_path = list(sys.path)`
Fix sys.path bug in PySpark SparkContext.addPyFile 2013-01-22 20:54:11 -05:00			`class_name = self.__class__.__name__`
			`self.sc = SparkContext('local[4]', class_name , batchSize=2)`
Add RDD checkpointing to Python API. 2013-01-16 22:15:14 -05:00
			`def tearDown(self):`
			`self.sc.stop()`
Allow PySpark's SparkFiles to be used from driver Fix minor documentation formatting issues. 2013-01-23 13:36:18 -05:00			`sys.path = self._old_sys_path`
Add checkpointFile() and more tests to PySpark. 2013-01-20 16:59:45 -05:00			`# To avoid Akka rebinding to the same port, since it doesn't unbind`
			`# immediately on shutdown`
Do not launch JavaGateways on workers (SPARK-674). The problem was that the gateway was being initialized whenever the pyspark.context module was loaded. The fix uses lazy initialization that occurs only when SparkContext instances are actually constructed. I also made the gateway and jvm variables private. This change results in ~3-4x performance improvement when running the PySpark unit tests. 2013-02-01 14:09:56 -05:00			`self.sc._jvm.System.clearProperty("spark.driver.port")`
Fix sys.path bug in PySpark SparkContext.addPyFile 2013-01-22 20:54:11 -05:00

			`class TestCheckpoint(PySparkTestCase):`

			`def setUp(self):`
			`PySparkTestCase.setUp(self)`
			`self.checkpointDir = NamedTemporaryFile(delete=False)`
			`os.unlink(self.checkpointDir.name)`
			`self.sc.setCheckpointDir(self.checkpointDir.name)`

			`def tearDown(self):`
			`PySparkTestCase.tearDown(self)`
Clean up setup code in PySpark checkpointing tests 2013-01-20 18:38:11 -05:00			`shutil.rmtree(self.checkpointDir.name)`
Add RDD checkpointing to Python API. 2013-01-16 22:15:14 -05:00
			`def test_basic_checkpointing(self):`
			`parCollection = self.sc.parallelize([1, 2, 3, 4])`
			`flatMappedRDD = parCollection.flatMap(lambda x: range(1, x + 1))`

			`self.assertFalse(flatMappedRDD.isCheckpointed())`
Fix PySpark unit tests on Python 2.6. 2013-08-14 18:12:12 -04:00			`self.assertTrue(flatMappedRDD.getCheckpointFile() is None)`
Add RDD checkpointing to Python API. 2013-01-16 22:15:14 -05:00
			`flatMappedRDD.checkpoint()`
			`result = flatMappedRDD.collect()`
			`time.sleep(1) # 1 second`
			`self.assertTrue(flatMappedRDD.isCheckpointed())`
			`self.assertEqual(flatMappedRDD.collect(), result)`
Fixed Python API for sc.setCheckpointDir. Also other fixes based on Reynold's comments on PR 289. 2013-12-24 17:01:13 -05:00			`self.assertEqual("file:" + self.checkpointDir.name,`
			`os.path.dirname(os.path.dirname(flatMappedRDD.getCheckpointFile())))`
Add RDD checkpointing to Python API. 2013-01-16 22:15:14 -05:00
Add checkpointFile() and more tests to PySpark. 2013-01-20 16:59:45 -05:00			`def test_checkpoint_and_restore(self):`
			`parCollection = self.sc.parallelize([1, 2, 3, 4])`
			`flatMappedRDD = parCollection.flatMap(lambda x: [x])`

			`self.assertFalse(flatMappedRDD.isCheckpointed())`
Fix PySpark unit tests on Python 2.6. 2013-08-14 18:12:12 -04:00			`self.assertTrue(flatMappedRDD.getCheckpointFile() is None)`
Add checkpointFile() and more tests to PySpark. 2013-01-20 16:59:45 -05:00
			`flatMappedRDD.checkpoint()`
			`flatMappedRDD.count() # forces a checkpoint to be computed`
			`time.sleep(1) # 1 second`

Fix PySpark unit tests on Python 2.6. 2013-08-14 18:12:12 -04:00			`self.assertTrue(flatMappedRDD.getCheckpointFile() is not None)`
Add custom serializer support to PySpark. For now, this only adds MarshalSerializer, but it lays the groundwork for other supporting custom serializers. Many of these mechanisms can also be used to support deserialization of different data formats sent by Java, such as data encoded by MsgPack. This also fixes a bug in SparkContext.union(). 2013-11-05 20:52:39 -05:00			`recovered = self.sc._checkpointFile(flatMappedRDD.getCheckpointFile(),`
			`flatMappedRDD._jrdd_deserializer)`
Add checkpointFile() and more tests to PySpark. 2013-01-20 16:59:45 -05:00			`self.assertEquals([1, 2, 3, 4], recovered.collect())`

Add RDD checkpointing to Python API. 2013-01-16 22:15:14 -05:00
Fix sys.path bug in PySpark SparkContext.addPyFile 2013-01-22 20:54:11 -05:00			`class TestAddFile(PySparkTestCase):`

			`def test_add_py_file(self):`
			`# To ensure that we're actually testing addPyFile's effects, check that`
			# this job fails due to `userlibrary` not being on the Python path:
			`def func(x):`
			`from userlibrary import UserClass`
			`return UserClass().hello()`
			`self.assertRaises(Exception,`
			`self.sc.parallelize(range(2)).map(func).first)`
			`# Add the file, so the job should now succeed:`
			`path = os.path.join(SPARK_HOME, "python/test_support/userlibrary.py")`
			`self.sc.addPyFile(path)`
			`res = self.sc.parallelize(range(2)).map(func).first()`
			`self.assertEqual("Hello World!", res)`

Allow PySpark's SparkFiles to be used from driver Fix minor documentation formatting issues. 2013-01-23 13:36:18 -05:00			`def test_add_file_locally(self):`
			`path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")`
			`self.sc.addFile(path)`
			`download_path = SparkFiles.get("hello.txt")`
			`self.assertNotEqual(path, download_path)`
			`with open(download_path) as test_file:`
			`self.assertEquals("Hello World!\n", test_file.readline())`

			`def test_add_py_file_locally(self):`
			`# To ensure that we're actually testing addPyFile's effects, check that`
			# this fails due to `userlibrary` not being on the Python path:
			`def func():`
			`from userlibrary import UserClass`
			`self.assertRaises(ImportError, func)`
			`path = os.path.join(SPARK_HOME, "python/test_support/userlibrary.py")`
			`self.sc.addFile(path)`
			`from userlibrary import UserClass`
			`self.assertEqual("Hello World!", UserClass().hello())`

Implementing SPARK-878 for PySpark: adding zip and egg files to context and passing it down to workers which add these to their sys.path 2013-08-15 19:01:19 -04:00			`def test_add_egg_file_locally(self):`
			`# To ensure that we're actually testing addPyFile's effects, check that`
			# this fails due to `userlibrary` not being on the Python path:
			`def func():`
			`from userlib import UserClass`
			`self.assertRaises(ImportError, func)`
			`path = os.path.join(SPARK_HOME, "python/test_support/userlib-0.1-py2.7.egg")`
			`self.sc.addPyFile(path)`
			`from userlib import UserClass`
			`self.assertEqual("Hello World from inside a package!", UserClass().hello())`

Fix sys.path bug in PySpark SparkContext.addPyFile 2013-01-22 20:54:11 -05:00
Fix UnicodeEncodeError in PySpark saveAsTextFile(). Fixes SPARK-970. 2013-11-29 02:44:56 -05:00			`class TestRDDFunctions(PySparkTestCase):`

			`def test_save_as_textfile_with_unicode(self):`
			`# Regression test for SPARK-970`
			`x = u"\u00A1Hola, mundo!"`
			`data = self.sc.parallelize([x])`
			`tempFile = NamedTemporaryFile(delete=True)`
			`tempFile.close()`
			`data.saveAsTextFile(tempFile.name)`
			`raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))`
			`self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))`

Fix SPARK-1034: Py4JException on PySpark Cartesian Result 2014-01-23 16:05:59 -05:00			`def test_transforming_cartesian_result(self):`
			`# Regression test for SPARK-1034`
			`rdd1 = self.sc.parallelize([1, 2])`
			`rdd2 = self.sc.parallelize([3, 4])`
			`cart = rdd1.cartesian(rdd2)`
			`result = cart.map(lambda (x, y): x + y).collect()`

Fix SPARK-978: ClassCastException in PySpark cartesian. 2014-01-23 18:09:19 -05:00			`def test_cartesian_on_textfile(self):`
			`# Regression test for`
			`path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")`
			`a = self.sc.textFile(path)`
			`result = a.cartesian(a).collect()`
			`(x, y) = result[0]`
			`self.assertEqual("Hello World!", x.strip())`
			`self.assertEqual("Hello World!", y.strip())`

Fix UnicodeEncodeError in PySpark saveAsTextFile(). Fixes SPARK-970. 2013-11-29 02:44:56 -05:00
Fix stdout redirection in PySpark. 2013-02-01 03:25:19 -05:00			`class TestIO(PySparkTestCase):`

			`def test_stdout_redirection(self):`
			`import subprocess`
			`def func(x):`
			`subprocess.check_call('ls', shell=True)`
			`self.sc.parallelize([1]).foreach(func)`


Add tests and fixes for Python daemon shutdown 2013-05-10 18:48:48 -04:00			`class TestDaemon(unittest.TestCase):`
			`def connect(self, port):`
			`from socket import socket, AF_INET, SOCK_STREAM`
			`sock = socket(AF_INET, SOCK_STREAM)`
			`sock.connect(('127.0.0.1', port))`
			`# send a split index of -1 to shutdown the worker`
			`sock.send("\xFF\xFF\xFF\xFF")`
			`sock.close()`
			`return True`

			`def do_termination_test(self, terminator):`
			`from subprocess import Popen, PIPE`
			`from errno import ECONNREFUSED`

			`# start daemon`
			`daemon_path = os.path.join(os.path.dirname(__file__), "daemon.py")`
			`daemon = Popen([sys.executable, daemon_path], stdin=PIPE, stdout=PIPE)`

			`# read the port number`
			`port = read_int(daemon.stdout)`

			`# daemon should accept connections`
			`self.assertTrue(self.connect(port))`

			`# request shutdown`
			`terminator(daemon)`
			`time.sleep(1)`

			`# daemon should no longer accept connections`
Fix PySpark unit tests on Python 2.6. 2013-08-14 18:12:12 -04:00			`try:`
Add tests and fixes for Python daemon shutdown 2013-05-10 18:48:48 -04:00			`self.connect(port)`
Fix PySpark unit tests on Python 2.6. 2013-08-14 18:12:12 -04:00			`except EnvironmentError as exception:`
			`self.assertEqual(exception.errno, ECONNREFUSED)`
			`else:`
			`self.fail("Expected EnvironmentError to be raised")`
Add tests and fixes for Python daemon shutdown 2013-05-10 18:48:48 -04:00
			`def test_termination_stdin(self):`
			`"""Ensure that daemon and workers terminate when stdin is closed."""`
			`self.do_termination_test(lambda daemon: daemon.stdin.close())`

			`def test_termination_sigterm(self):`
			`"""Ensure that daemon and workers terminate on SIGTERM."""`
			`from signal import SIGTERM`
			`self.do_termination_test(lambda daemon: os.kill(daemon.pid, SIGTERM))`

Add RDD checkpointing to Python API. 2013-01-16 22:15:14 -05:00			`if __name__ == "__main__":`
			`unittest.main()`