spark-instrumented-optimizer/python/pyspark/worker.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Worker that receives input from Piped RDD.
"""
import os
import sys
import time
import traceback
from base64 import standard_b64decode
# The CloudPickler import below is required even though the name is not used
# directly: importing it registers depicklers through the copy_reg module.
from pyspark.accumulators import _accumulatorRegistry
from pyspark.broadcast import Broadcast, _broadcastRegistry
from pyspark.cloudpickle import CloudPickler
from pyspark.files import SparkFiles
from pyspark.serializers import write_with_length, read_with_length, write_int, \
    read_long, write_long, read_int, dump_pickle, load_pickle, read_from_pickle_file


def load_obj(infile):
    """Read one base64-encoded, pickled object from the next line of infile.

    The line is stripped of surrounding whitespace, base64-decoded and then
    unpickled with load_pickle.

    NOTE(review): this unpickles whatever arrives on infile, so infile must
    only ever be fed by a trusted peer.
    """
    encoded_line = infile.readline().strip()
    pickled_bytes = standard_b64decode(encoded_line)
    return load_pickle(pickled_bytes)


def report_times(outfile, boot, init, finish):
    """Write the timing section of the worker output to outfile.

    The section is the integer marker -3 followed by the three timestamps,
    each converted from seconds to whole milliseconds.

    :param outfile: stream the section is written to
    :param boot: timestamp in seconds (main() passes the time.time() value
        taken when it started)
    :param init: timestamp in seconds (main() passes the time.time() value
        taken once task setup was read)
    :param finish: timestamp in seconds (main() passes the time.time() value
        taken after the last result was written)
    """
    write_int(-3, outfile)
    # The timestamps come from time.time() and are floats. Truncate to an
    # integer number of milliseconds explicitly instead of handing a float to
    # write_long (presumably a fixed-width integer writer; it is defined in
    # pyspark.serializers and not visible from here).
    for seconds in (boot, init, finish):
        write_long(int(1000 * seconds), outfile)


def main(infile, outfile):
    """Execute a single task: read its setup and input from infile and write
    its results to outfile.

    Read from infile, in this order:
      1. the split index as an int (-1 makes the function return at once;
         used by unit tests),
      2. the pickled SparkFiles root directory,
      3. the number of broadcast variables, then an (id, pickled value) pair
         for each of them,
      4. the function to run and the bypassSerializer flag, each via load_obj,
      5. the input records, consumed through read_from_pickle_file.

    Written to outfile:
      * every object produced by the function, length-prefixed (pickled
        unless bypassSerializer is set),
      * on an exception in the function: the marker -2 and the formatted
        traceback, after which the process exits with status -1,
      * otherwise: the timing section (see report_times), the marker -1,
        one pickled (id, value) pair per registered accumulator, and a
        closing -1.
    """
    boot_time = time.time()
    split_index = read_int(infile)
    if split_index == -1:  # for unit tests
        return

    # Point SparkFiles at the directory sent for this task and make that
    # directory importable.
    files_root = load_pickle(read_with_length(infile))
    SparkFiles._root_directory = files_root
    SparkFiles._is_running_on_worker = True
    sys.path.append(files_root)

    # Register every broadcast variable sent for this task.
    broadcast_count = read_int(infile)
    for _ in range(broadcast_count):
        broadcast_id = read_long(infile)
        pickled_value = read_with_length(infile)
        _broadcastRegistry[broadcast_id] = Broadcast(
            broadcast_id, load_pickle(pickled_value))

    func = load_obj(infile)
    bypass_serializer = load_obj(infile)
    # With the serializer bypassed, results are written exactly as produced.
    serialize = (lambda x: x) if bypass_serializer else dump_pickle
    init_time = time.time()

    records = read_from_pickle_file(infile)
    try:
        for result in func(split_index, records):
            write_with_length(serialize(result), outfile)
    except Exception:
        # Send the failure to the reader of outfile instead of results.
        write_int(-2, outfile)
        write_with_length(traceback.format_exc(), outfile)
        sys.exit(-1)

    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)

    # Accumulators section: opened and closed by a -1 marker.
    write_int(-1, outfile)
    for aid, accum in _accumulatorRegistry.items():
        write_with_length(dump_pickle((aid, accum._value)), outfile)
    write_int(-1, outfile)


if __name__ == '__main__':
    # Duplicate the original stdout descriptor and keep it as the stream that
    # main() writes to, then make descriptor 1 refer to stderr. Anything user
    # code prints therefore goes to stderr, so users must return values from
    # their functions.
    stdout_fd, stderr_fd = 1, 2
    result_stream = os.fdopen(os.dup(stdout_fd), 'w')
    os.dup2(stderr_fd, stdout_fd)
    main(sys.stdin, result_stream)
Add Apache license headers and LICENSE and NOTICE files 2013-07-16 20:21:33 -04:00			`#`
			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

Add Python API. 2012-08-10 04:10:02 -04:00			`"""`
			`Worker that receives input from Piped RDD.`
			`"""`
Fix stdout redirection in PySpark. 2013-02-01 03:25:19 -05:00			`import os`
Add Python API. 2012-08-10 04:10:02 -04:00			`import sys`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`import time`
SPARK-673: Capture and re-throw Python exceptions This patch alters the Python <-> executor protocol to pass on exception data when they occur in user Python code. 2013-01-31 21:02:28 -05:00			`import traceback`
Add Python API. 2012-08-10 04:10:02 -04:00			`from base64 import standard_b64decode`
			`# CloudPickler needs to be imported so that depicklers are registered using the`
			`# copy_reg module.`
Added accumulators to PySpark 2013-01-20 04:57:44 -05:00			`from pyspark.accumulators import _accumulatorRegistry`
Add broadcast variables to Python API. 2012-08-25 16:59:01 -04:00			`from pyspark.broadcast import Broadcast, _broadcastRegistry`
Bundle cloudpickle with pyspark. 2012-08-19 20:12:51 -04:00			`from pyspark.cloudpickle import CloudPickler`
Don't download files to master's working directory. This should avoid exceptions caused by existing files with different contents. I also removed some unused code. 2013-01-21 19:42:24 -05:00			`from pyspark.files import SparkFiles`
Added accumulators to PySpark 2013-01-20 04:57:44 -05:00			`from pyspark.serializers import write_with_length, read_with_length, write_int, \`
Add Python timing instrumentation 2013-03-10 16:54:46 -04:00			`read_long, write_long, read_int, dump_pickle, load_pickle, read_from_pickle_file`
Simplify Python worker; pipeline the map step of partitionBy(). 2012-08-25 19:46:07 -04:00
Add Python API. 2012-08-10 04:10:02 -04:00
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`def load_obj(infile):`
			`return load_pickle(standard_b64decode(infile.readline().strip()))`
Add Python API. 2012-08-10 04:10:02 -04:00

Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`def report_times(outfile, boot, init, finish):`
			`write_int(-3, outfile)`
			`write_long(1000 * boot, outfile)`
			`write_long(1000 * init, outfile)`
			`write_long(1000 * finish, outfile)`
Add Python timing instrumentation 2013-03-10 16:54:46 -04:00

Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`def main(infile, outfile):`
Add Python timing instrumentation 2013-03-10 16:54:46 -04:00			`boot_time = time.time()`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`split_index = read_int(infile)`
Add tests and fixes for Python daemon shutdown 2013-05-10 18:48:48 -04:00			`if split_index == -1: # for unit tests`
			`return`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`spark_files_dir = load_pickle(read_with_length(infile))`
Don't download files to master's working directory. This should avoid exceptions caused by existing files with different contents. I also removed some unused code. 2013-01-21 19:42:24 -05:00			`SparkFiles._root_directory = spark_files_dir`
Allow PySpark's SparkFiles to be used from driver Fix minor documentation formatting issues. 2013-01-23 13:36:18 -05:00			`SparkFiles._is_running_on_worker = True`
Fix sys.path bug in PySpark SparkContext.addPyFile 2013-01-22 20:54:11 -05:00			`sys.path.append(spark_files_dir)`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`num_broadcast_variables = read_int(infile)`
Add broadcast variables to Python API. 2012-08-25 16:59:01 -04:00			`for _ in range(num_broadcast_variables):`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`bid = read_long(infile)`
			`value = read_with_length(infile)`
Update Python API for v0.6.0 compatibility. 2012-10-19 13:24:49 -04:00			`_broadcastRegistry[bid] = Broadcast(bid, load_pickle(value))`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`func = load_obj(infile)`
			`bypassSerializer = load_obj(infile)`
Simplify Python worker; pipeline the map step of partitionBy(). 2012-08-25 19:46:07 -04:00			`if bypassSerializer:`
			`dumps = lambda x: x`
Add Python API. 2012-08-10 04:10:02 -04:00			`else:`
Simplify Python worker; pipeline the map step of partitionBy(). 2012-08-25 19:46:07 -04:00			`dumps = dump_pickle`
Add Python timing instrumentation 2013-03-10 16:54:46 -04:00			`init_time = time.time()`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`iterator = read_from_pickle_file(infile)`
SPARK-673: Capture and re-throw Python exceptions This patch alters the Python <-> executor protocol to pass on exception data when they occur in user Python code. 2013-01-31 21:02:28 -05:00			`try:`
			`for obj in func(split_index, iterator):`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`write_with_length(dumps(obj), outfile)`
SPARK-673: Capture and re-throw Python exceptions This patch alters the Python <-> executor protocol to pass on exception data when they occur in user Python code. 2013-01-31 21:02:28 -05:00			`except Exception as e:`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`write_int(-2, outfile)`
			`write_with_length(traceback.format_exc(), outfile)`
Fix reporting of PySpark exceptions 2013-06-21 12:13:48 -04:00			`sys.exit(-1)`
Add Python timing instrumentation 2013-03-10 16:54:46 -04:00			`finish_time = time.time()`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`report_times(outfile, boot_time, init_time, finish_time)`
Added accumulators to PySpark 2013-01-20 04:57:44 -05:00			`# Mark the beginning of the accumulators section of the output`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`write_int(-1, outfile)`
Added accumulators to PySpark 2013-01-20 04:57:44 -05:00			`for aid, accum in _accumulatorRegistry.items():`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`write_with_length(dump_pickle((aid, accum._value)), outfile)`
			`write_int(-1, outfile)`
Add Python API. 2012-08-10 04:10:02 -04:00

			`if __name__ == '__main__':`
Prefork Python worker processes 2013-05-06 19:34:30 -04:00			`# Redirect stdout to stderr so that users must return values from functions.`
			`old_stdout = os.fdopen(os.dup(1), 'w')`
			`os.dup2(2, 1)`
			`main(sys.stdin, old_stdout)`