spark-instrumented-optimizer/python/pyspark/worker.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Worker that receives input from Piped RDD.
"""
import os
import sys
import time
import socket
import traceback
# CloudPickler needs to be imported so that depicklers are registered using the
# copy_reg module.
from pyspark.accumulators import _accumulatorRegistry
from pyspark.broadcast import Broadcast, _broadcastRegistry
from pyspark.cloudpickle import CloudPickler
from pyspark.files import SparkFiles
from pyspark.serializers import write_with_length, write_int, read_long, \
    write_long, read_int, SpecialLengths, UTF8Deserializer, PickleSerializer


# Module-level serializer instances shared by main() below.
# pickleSer reads the broadcast values and the task command from the input
# stream (via _read_with_length) and writes the accumulator updates to the
# output stream (via _write_with_length).
pickleSer = PickleSerializer()
# utf8_deserializer decodes the string fields of the task header: the
# SparkFiles working directory and the names of the Python include files.
utf8_deserializer = UTF8Deserializer()


def report_times(outfile, boot, init, finish):
    """
    Write the worker's timing record to ``outfile``.

    The record is the ``SpecialLengths.TIMING_DATA`` marker followed by three
    longs: the boot, init and finish timestamps, in that order, converted
    from seconds to milliseconds.

    :param outfile: stream the record is written to
    :param boot: timestamp in seconds taken when the worker started the task
    :param init: timestamp in seconds taken once the task command was loaded
    :param finish: timestamp in seconds taken after the task output was written
    """
    write_int(SpecialLengths.TIMING_DATA, outfile)
    for seconds in (boot, init, finish):
        # main() passes time.time() floats; convert to whole milliseconds
        # explicitly so write_long always receives an integer instead of
        # relying on implicit (deprecated) float-to-int coercion.
        write_long(int(1000 * seconds), outfile)


def main(infile, outfile):
    """
    Run a single task: read its description from ``infile``, execute it and
    write the results to ``outfile``.

    The input stream is consumed in this order:

    1. the split index (``-1`` makes the function return immediately; used
       by unit tests),
    2. the SparkFiles working directory,
    3. the number of Python includes, followed by each include's file name,
    4. the number of broadcast variables, followed by an (id, pickled value)
       pair for each,
    5. the pickled ``(func, deserializer, serializer)`` command,
    6. the task's input data, read through ``deserializer``.

    On success the output stream receives the serialized results of
    ``func(split_index, iterator)``, the timing record, the
    ``END_OF_DATA_SECTION`` marker, the number of accumulators and one pickled
    ``(id, value)`` pair per accumulator.

    On any exception the function tries to send
    ``PYTHON_EXCEPTION_THROWN`` plus the formatted traceback to ``outfile``
    and then terminates the process with exit status -1.

    :param infile: stream the task description and input data are read from
    :param outfile: stream results, timings and accumulators are written to
    """
    try:
        boot_time = time.time()
        split_index = read_int(infile)
        if split_index == -1:  # for unit tests
            return

        # fetch name of workdir
        spark_files_dir = utf8_deserializer.loads(infile)
        SparkFiles._root_directory = spark_files_dir
        SparkFiles._is_running_on_worker = True

        # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
        sys.path.append(spark_files_dir)  # *.py files that were added will be copied here
        num_python_includes = read_int(infile)
        for _ in range(num_python_includes):
            filename = utf8_deserializer.loads(infile)
            sys.path.append(os.path.join(spark_files_dir, filename))

        # fetch names and values of broadcast variables; this must happen
        # after sys.path is extended so that values whose classes live in the
        # includes can be unpickled
        # NOTE(review): broadcast values and the command are unpickled
        # straight from infile -- this presumes the peer is the trusted
        # process that launched the worker; confirm before exposing the port.
        num_broadcast_variables = read_int(infile)
        for _ in range(num_broadcast_variables):
            bid = read_long(infile)
            value = pickleSer._read_with_length(infile)
            _broadcastRegistry[bid] = Broadcast(bid, value)

        command = pickleSer._read_with_length(infile)
        (func, deserializer, serializer) = command
        init_time = time.time()
        iterator = deserializer.load_stream(infile)
        serializer.dump_stream(func(split_index, iterator), outfile)
    except Exception:
        try:
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(traceback.format_exc(), outfile)
            outfile.flush()
        except IOError:
            # The JVM closed the socket, so there is nobody left to report
            # to; stay silent.
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            sys.stderr.write("PySpark worker failed with exception:\n")
            sys.stderr.write(traceback.format_exc() + "\n")
        # Use sys.exit rather than the builtin exit(): the latter is injected
        # by the site module for interactive use and may be missing.
        sys.exit(-1)
    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    # Mark the beginning of the accumulators section of the output
    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    write_int(len(_accumulatorRegistry), outfile)
    for (aid, accum) in _accumulatorRegistry.items():
        pickleSer._write_with_length((aid, accum._value), outfile)


if __name__ == '__main__':
    # Script entry point: the port to connect to arrives as one line on stdin.
    port = int(sys.stdin.readline())
    loopback_address = ("127.0.0.1", port)

    # Open a TCP connection to that port on the local machine.
    conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    conn.connect(loopback_address)

    # A single file object (buffer size 65536) wraps the socket and serves as
    # both the input and the output stream of the task.
    stream = conn.makefile("a+", 65536)
    main(infile=stream, outfile=stream)
Add Apache license headers and LICENSE and NOTICE files 2013-07-16 20:21:33 -04:00			`#`
			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

Add Python API. 2012-08-10 04:10:02 -04:00			`"""`
			`Worker that receives input from Piped RDD.`
			`"""`
Fix stdout redirection in PySpark. 2013-02-01 03:25:19 -05:00			`import os`
Add Python API. 2012-08-10 04:10:02 -04:00			`import sys`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`import time`
Allow PySpark to launch worker.py directly on Windows 2013-09-01 21:06:15 -04:00			`import socket`
SPARK-673: Capture and re-throw Python exceptions This patch alters the Python <-> executor protocol to pass on exception data when they occur in user Python code. 2013-01-31 21:02:28 -05:00			`import traceback`
Add Python API. 2012-08-10 04:10:02 -04:00			`# CloudPickler needs to be imported so that depicklers are registered using the`
			`# copy_reg module.`
Added accumulators to PySpark 2013-01-20 04:57:44 -05:00			`from pyspark.accumulators import _accumulatorRegistry`
Add broadcast variables to Python API. 2012-08-25 16:59:01 -04:00			`from pyspark.broadcast import Broadcast, _broadcastRegistry`
Bundle cloudpickle with pyspark. 2012-08-19 20:12:51 -04:00			`from pyspark.cloudpickle import CloudPickler`
Don't download files to master's working directory. This should avoid exceptions caused by existing files with different contents. I also removed some unused code. 2013-01-21 19:42:24 -05:00			`from pyspark.files import SparkFiles`
Add custom serializer support to PySpark. For now, this only adds MarshalSerializer, but it lays the groundwork for other supporting custom serializers. Many of these mechanisms can also be used to support deserialization of different data formats sent by Java, such as data encoded by MsgPack. This also fixes a bug in SparkContext.union(). 2013-11-05 20:52:39 -05:00			`from pyspark.serializers import write_with_length, write_int, read_long, \`
Switch from MUTF8 to UTF8 in PySpark serializers. This fixes SPARK-1043, a bug introduced in 0.9.0 where PySpark couldn't serialize strings > 64kB. This fix was written by @tyro89 and @bouk in #512. This commit squashes and rebases their pull request in order to fix some merge conflicts. 2014-01-28 22:50:26 -05:00			`write_long, read_int, SpecialLengths, UTF8Deserializer, PickleSerializer`
Add custom serializer support to PySpark. For now, this only adds MarshalSerializer, but it lays the groundwork for other supporting custom serializers. Many of these mechanisms can also be used to support deserialization of different data formats sent by Java, such as data encoded by MsgPack. This also fixes a bug in SparkContext.union(). 2013-11-05 20:52:39 -05:00

			`pickleSer = PickleSerializer()`
Switch from MUTF8 to UTF8 in PySpark serializers. This fixes SPARK-1043, a bug introduced in 0.9.0 where PySpark couldn't serialize strings > 64kB. This fix was written by @tyro89 and @bouk in #512. This commit squashes and rebases their pull request in order to fix some merge conflicts. 2014-01-28 22:50:26 -05:00			`utf8_deserializer = UTF8Deserializer()`
Simplify Python worker; pipeline the map step of partitionBy(). 2012-08-25 19:46:07 -04:00
Add Python API. 2012-08-10 04:10:02 -04:00
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`def report_times(outfile, boot, init, finish):`
Replace magic lengths with constants in PySpark. Write the length of the accumulators section up-front rather than terminating it with a negative length. I find this easier to read. 2013-11-03 00:13:18 -04:00			`write_int(SpecialLengths.TIMING_DATA, outfile)`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`write_long(1000 * boot, outfile)`
			`write_long(1000 * init, outfile)`
			`write_long(1000 * finish, outfile)`
Add Python timing instrumentation 2013-03-10 16:54:46 -04:00

Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`def main(infile, outfile):`
SPARK-1115: Catch depickling errors This surrounds the complete worker code in a try/except block so we catch any error that arrives. An example would be the depickling failing for some reason @JoshRosen Author: Bouke van der Bijl <boukevanderbijl@gmail.com> Closes #644 from bouk/catch-depickling-errors and squashes the following commits: f0f67cc [Bouke van der Bijl] Lol indentation 0e4d504 [Bouke van der Bijl] Surround the complete python worker with the try block 2014-02-26 17:50:37 -05:00			`try:`
			`boot_time = time.time()`
			`split_index = read_int(infile)`
			`if split_index == -1: # for unit tests`
			`return`
Implementing SPARK-878 for PySpark: adding zip and egg files to context and passing it down to workers which add these to their sys.path 2013-08-15 19:01:19 -04:00
SPARK-1115: Catch depickling errors This surrounds the complete worker code in a try/except block so we catch any error that arrives. An example would be the depickling failing for some reason @JoshRosen Author: Bouke van der Bijl <boukevanderbijl@gmail.com> Closes #644 from bouk/catch-depickling-errors and squashes the following commits: f0f67cc [Bouke van der Bijl] Lol indentation 0e4d504 [Bouke van der Bijl] Surround the complete python worker with the try block 2014-02-26 17:50:37 -05:00			`# fetch name of workdir`
			`spark_files_dir = utf8_deserializer.loads(infile)`
			`SparkFiles._root_directory = spark_files_dir`
			`SparkFiles._is_running_on_worker = True`
Implementing SPARK-878 for PySpark: adding zip and egg files to context and passing it down to workers which add these to their sys.path 2013-08-15 19:01:19 -04:00
SPARK-1115: Catch depickling errors This surrounds the complete worker code in a try/except block so we catch any error that arrives. An example would be the depickling failing for some reason @JoshRosen Author: Bouke van der Bijl <boukevanderbijl@gmail.com> Closes #644 from bouk/catch-depickling-errors and squashes the following commits: f0f67cc [Bouke van der Bijl] Lol indentation 0e4d504 [Bouke van der Bijl] Surround the complete python worker with the try block 2014-02-26 17:50:37 -05:00			`# fetch names of includes (.zip and .egg files) and construct PYTHONPATH`
[SPARK-2470] PEP8 fixes to PySpark This pull request aims to resolve all outstanding PEP8 violations in PySpark. Author: Nicholas Chammas <nicholas.chammas@gmail.com> Author: nchammas <nicholas.chammas@gmail.com> Closes #1505 from nchammas/master and squashes the following commits: 98171af [Nicholas Chammas] [SPARK-2470] revert PEP 8 fixes to cloudpickle cba7768 [Nicholas Chammas] [SPARK-2470] wrap expression list in parentheses e178dbe [Nicholas Chammas] [SPARK-2470] style - change position of line break 9127d2b [Nicholas Chammas] [SPARK-2470] wrap expression lists in parentheses 22132a4 [Nicholas Chammas] [SPARK-2470] wrap conditionals in parentheses 24639bc [Nicholas Chammas] [SPARK-2470] fix whitespace for doctest 7d557b7 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to tests.py 8f8e4c0 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to storagelevel.py b3b96cf [Nicholas Chammas] [SPARK-2470] PEP8 fixes to statcounter.py d644477 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to worker.py aa3a7b6 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to sql.py 1916859 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to shell.py 95d1d95 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to serializers.py a0fec2e [Nicholas Chammas] [SPARK-2470] PEP8 fixes to mllib c85e1e5 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to join.py d14f2f1 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to __init__.py 81fcb20 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to resultiterable.py 1bde265 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to java_gateway.py 7fc849c [Nicholas Chammas] [SPARK-2470] PEP8 fixes to daemon.py ca2d28b [Nicholas Chammas] [SPARK-2470] PEP8 fixes to context.py f4e0039 [Nicholas Chammas] [SPARK-2470] PEP8 fixes to conf.py a6d5e4b [Nicholas Chammas] [SPARK-2470] PEP8 fixes to cloudpickle.py f0a7ebf [Nicholas Chammas] [SPARK-2470] PEP8 fixes to rddsampler.py 4dd148f [nchammas] Merge pull request #5 from apache/master f7e4581 [Nicholas Chammas] unrelated pep8 fix a36eed0 [Nicholas Chammas] name ec2 instances 
and security groups consistently de7292a [nchammas] Merge pull request #4 from apache/master 2e4fe00 [nchammas] Merge pull request #3 from apache/master 89fde08 [nchammas] Merge pull request #2 from apache/master 69f6e22 [Nicholas Chammas] PEP8 fixes 2627247 [Nicholas Chammas] broke up lines before they hit 100 chars 6544b7e [Nicholas Chammas] [SPARK-2065] give launched instances names 69da6cf [nchammas] Merge pull request #1 from apache/master 2014-07-22 01:30:53 -04:00			`sys.path.append(spark_files_dir) # *.py files that were added will be copied here`
			`num_python_includes = read_int(infile)`
SPARK-1115: Catch depickling errors This surrounds the complete worker code in a try/except block so we catch any error that arrives. An example would be the depickling failing for some reason @JoshRosen Author: Bouke van der Bijl <boukevanderbijl@gmail.com> Closes #644 from bouk/catch-depickling-errors and squashes the following commits: f0f67cc [Bouke van der Bijl] Lol indentation 0e4d504 [Bouke van der Bijl] Surround the complete python worker with the try block 2014-02-26 17:50:37 -05:00			`for _ in range(num_python_includes):`
			`filename = utf8_deserializer.loads(infile)`
			`sys.path.append(os.path.join(spark_files_dir, filename))`
Implementing SPARK-878 for PySpark: adding zip and egg files to context and passing it down to workers which add these to their sys.path 2013-08-15 19:01:19 -04:00
Add Python includes to path before depickling broadcast values This fixes https://issues.apache.org/jira/browse/SPARK-1731 by adding the Python includes to the PYTHONPATH before depickling the broadcast values @airhorns Author: Bouke van der Bijl <boukevanderbijl@gmail.com> Closes #656 from bouk/python-includes-before-broadcast and squashes the following commits: 7b0dfe4 [Bouke van der Bijl] Add Python includes to path before depickling broadcast values 2014-05-10 16:02:13 -04:00			`# fetch names and values of broadcast variables`
			`num_broadcast_variables = read_int(infile)`
			`for _ in range(num_broadcast_variables):`
			`bid = read_long(infile)`
			`value = pickleSer._read_with_length(infile)`
			`_broadcastRegistry[bid] = Broadcast(bid, value)`

SPARK-1115: Catch depickling errors This surrounds the complete worker code in a try/except block so we catch any error that arrives. An example would be the depickling failing for some reason @JoshRosen Author: Bouke van der Bijl <boukevanderbijl@gmail.com> Closes #644 from bouk/catch-depickling-errors and squashes the following commits: f0f67cc [Bouke van der Bijl] Lol indentation 0e4d504 [Bouke van der Bijl] Surround the complete python worker with the try block 2014-02-26 17:50:37 -05:00			`command = pickleSer._read_with_length(infile)`
			`(func, deserializer, serializer) = command`
			`init_time = time.time()`
Add custom serializer support to PySpark. For now, this only adds MarshalSerializer, but it lays the groundwork for other supporting custom serializers. Many of these mechanisms can also be used to support deserialization of different data formats sent by Java, such as data encoded by MsgPack. This also fixes a bug in SparkContext.union(). 2013-11-05 20:52:39 -05:00			`iterator = deserializer.load_stream(infile)`
			`serializer.dump_stream(func(split_index, iterator), outfile)`
[SPARK-2580] [PySpark] keep silent in worker if JVM closes the socket During rdd.take(n), the JVM will close the socket once it has got enough data; the Python worker should keep silent in this case. At the same time, the worker should not print the traceback to stderr if it sends the traceback to the JVM successfully. Author: Davies Liu <davies.liu@gmail.com> Closes #1625 from davies/error and squashes the following commits: 4fbcc6d [Davies Liu] disable log4j during testing when exception is expected. cc14202 [Davies Liu] keep silent in worker if JVM close the socket 2014-07-29 03:15:45 -04:00			`except Exception:`
			`try:`
			`write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)`
			`write_with_length(traceback.format_exc(), outfile)`
			`outfile.flush()`
			`except IOError:`
			`# JVM close the socket`
			`pass`
			`except Exception:`
			`# Write the error to stderr if it happened while serializing`
			`print >> sys.stderr, "PySpark worker failed with exception:"`
			`print >> sys.stderr, traceback.format_exc()`
			`exit(-1)`
Add Python timing instrumentation 2013-03-10 16:54:46 -04:00			`finish_time = time.time()`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`report_times(outfile, boot_time, init_time, finish_time)`
Added accumulators to PySpark 2013-01-20 04:57:44 -05:00			`# Mark the beginning of the accumulators section of the output`
Replace magic lengths with constants in PySpark. Write the length of the accumulators section up-front rather than terminating it with a negative length. I find this easier to read. 2013-11-03 00:13:18 -04:00			`write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)`
			`write_int(len(_accumulatorRegistry), outfile)`
			`for (aid, accum) in _accumulatorRegistry.items():`
Add custom serializer support to PySpark. For now, this only adds MarshalSerializer, but it lays the groundwork for other supporting custom serializers. Many of these mechanisms can also be used to support deserialization of different data formats sent by Java, such as data encoded by MsgPack. This also fixes a bug in SparkContext.union(). 2013-11-05 20:52:39 -05:00			`pickleSer._write_with_length((aid, accum._value), outfile)`
Add Python API. 2012-08-10 04:10:02 -04:00

			`if __name__ == '__main__':`
Allow PySpark to launch worker.py directly on Windows 2013-09-01 21:06:15 -04:00			`# Read a local port to connect to from stdin`
			`java_port = int(sys.stdin.readline())`
			`sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)`
			`sock.connect(("127.0.0.1", java_port))`
			`sock_file = sock.makefile("a+", 65536)`
			`main(sock_file, sock_file)`