2013-05-06 19:34:30 -04:00
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import multiprocessing
|
2013-05-23 14:50:24 -04:00
|
|
|
from ctypes import c_bool
|
2013-05-06 19:34:30 -04:00
|
|
|
from errno import EINTR, ECHILD
|
|
|
|
from socket import socket, AF_INET, SOCK_STREAM, SOMAXCONN
|
|
|
|
from signal import signal, SIGHUP, SIGTERM, SIGCHLD, SIG_DFL, SIG_IGN
|
|
|
|
from pyspark.worker import main as worker_main
|
|
|
|
from pyspark.serializers import write_int
|
|
|
|
|
|
|
|
# Number of pool workers the manager forks at start-up: one per CPU, or 4
# when the platform cannot report its CPU count.
POOLSIZE = 4
try:
    POOLSIZE = multiprocessing.cpu_count()
except NotImplementedError:
    pass

# Shutdown flag. It is created before any fork and lives in a
# multiprocessing.Value so that processes forked later share it.
exit_flag = multiprocessing.Value(c_bool, False)
|
|
|
|
|
|
|
|
|
|
|
|
def should_exit():
    """Return the current value of the module-level ``exit_flag``.

    The flag is only read here, so no ``global`` declaration is needed.
    """
    return exit_flag.value
|
2013-05-06 19:34:30 -04:00
|
|
|
|
|
|
|
|
2013-06-21 12:13:48 -04:00
|
|
|
def compute_real_exit_code(exit_code):
    """Convert a ``SystemExit.code`` value into an integer for ``os._exit``.

    ``SystemExit.code`` can be an integer, ``None`` or any other object
    (typically a message string), but ``os._exit`` only accepts integers.

    :param exit_code: the ``code`` attribute of a caught ``SystemExit``.
    :return: ``exit_code`` unchanged when it is an integer; ``0`` when it is
        ``None``, because a bare ``sys.exit()`` means success; ``1`` for
        anything else.
    """
    import numbers

    if exit_code is None:
        return 0
    if isinstance(exit_code, numbers.Integral):
        return exit_code
    return 1
|
|
|
|
|
|
|
|
|
2013-05-06 19:34:30 -04:00
|
|
|
def worker(listen_sock):
    """Run one pool worker: accept clients and fork a child to serve each.

    Loops until ``should_exit()`` is true. Each accepted connection is handed
    to a freshly forked child, which runs ``worker_main`` on the connection
    and then terminates through ``os._exit``; the pool worker itself closes
    its copy of the connection and goes back to accepting.

    :param listen_sock: listening socket to accept client connections on.
    :raises EnvironmentError: when ``accept`` fails with an errno other than
        EINTR, or ``waitpid`` (in the SIGCHLD handler) fails with an errno
        other than EINTR or ECHILD.
    """
    # Redirect stdout to stderr
    os.dup2(2, 1)

    # Manager sends SIGHUP to request termination of workers in the pool
    def handle_sighup(*args):
        assert should_exit()
    signal(SIGHUP, handle_sighup)

    # Cleanup zombie children
    def handle_sigchld(*args):
        pid = status = None
        try:
            # Keep reaping until waitpid reports (0, 0), i.e. no further
            # child has changed state.
            while (pid, status) != (0, 0):
                pid, status = os.waitpid(0, os.WNOHANG)
        except EnvironmentError as err:
            if err.errno == EINTR:
                # retry
                handle_sigchld()
            elif err.errno != ECHILD:
                raise
    signal(SIGCHLD, handle_sigchld)

    # Handle clients
    while not should_exit():
        # Wait until a client arrives or we have to exit
        sock = None
        while not should_exit() and sock is None:
            try:
                sock, _ = listen_sock.accept()
            except EnvironmentError as err:
                # EINTR means a signal interrupted accept(); loop again so
                # the exit flag is re-checked.
                if err.errno != EINTR:
                    raise

        if sock is not None:
            # Fork a child to handle the client.
            # The client is handled in the child so that the manager
            # never receives SIGCHLD unless a worker crashes.
            if os.fork() == 0:
                # Leave the worker pool
                signal(SIGHUP, SIG_DFL)
                listen_sock.close()
                # Handle the client then exit
                sockfile = sock.makefile()
                exit_code = 0
                try:
                    worker_main(sockfile, sockfile)
                except SystemExit as exc:
                    exit_code = exc.code
                except Exception:
                    # The os._exit() in ``finally`` would otherwise discard
                    # this exception and report success; print it and exit
                    # with a failure status instead.
                    import traceback
                    traceback.print_exc()
                    exit_code = 1
                finally:
                    sockfile.close()
                    sock.close()
                    os._exit(compute_real_exit_code(exit_code))
            else:
                # Parent: the child owns the connection now.
                sock.close()
|
|
|
|
|
2013-05-23 14:50:24 -04:00
|
|
|
|
|
|
|
def launch_worker(listen_sock):
    """Fork one pool worker process that serves ``listen_sock``.

    The parent returns immediately. The child runs ``worker`` and then
    leaves through ``os._exit``: status 0 after ``worker`` returns, or
    status 1 (after printing the traceback) when it raises an ``Exception``.
    """
    if os.fork() != 0:
        # Parent process: nothing more to do here.
        return

    try:
        worker(listen_sock)
    except Exception:
        import traceback
        traceback.print_exc()
        os._exit(1)
    else:
        # worker() is expected to return only once shutdown was requested.
        assert should_exit()
        os._exit(0)
|
2013-05-06 19:34:30 -04:00
|
|
|
|
|
|
|
|
|
|
|
def manager():
    """Run the daemon: open the listening socket, fork the pool, await shutdown.

    In order, this function:

    1. puts the process in its own process group;
    2. binds a listening socket to an ephemeral port on 127.0.0.1 and writes
       the port number to stdout with ``write_int``;
    3. forks ``POOLSIZE`` workers with ``launch_worker``;
    4. installs SIGTERM / SIGHUP / SIGCHLD handling and closes stdout;
    5. blocks reading stdin until it reaches end-of-file or shutdown is
       requested;
    6. on the way out, sets the exit flag and sends SIGHUP to the whole
       process group.

    :raises RuntimeError: from the SIGCHLD handler, when a child exits with
        a non-zero status before shutdown was requested.
    :raises EnvironmentError: when reading stdin or ``waitpid`` fails with an
        errno that is not tolerated here.
    """
    # Create a new process group to corral our children
    os.setpgid(0, 0)

    # Create a listening socket on the AF_INET loopback interface
    listen_sock = socket(AF_INET, SOCK_STREAM)
    listen_sock.bind(('127.0.0.1', 0))
    listen_sock.listen(max(1024, 2 * POOLSIZE, SOMAXCONN))
    listen_port = listen_sock.getsockname()[1]
    write_int(listen_port, sys.stdout)

    # Launch initial worker pool
    for _ in range(POOLSIZE):
        launch_worker(listen_sock)
    # The workers hold their own copies of the socket; the manager never
    # accepts connections itself.
    listen_sock.close()

    def shutdown():
        exit_flag.value = True

    # Gracefully exit on SIGTERM, don't die on SIGHUP
    signal(SIGTERM, lambda signum, frame: shutdown())
    signal(SIGHUP, SIG_IGN)

    # Cleanup zombie children
    def handle_sigchld(*args):
        try:
            # One SIGCHLD can stand for several exited children, so keep
            # reaping until waitpid reports that nothing else has exited
            # (pid 0) or that no children remain (ECHILD).
            while True:
                pid, status = os.waitpid(0, os.WNOHANG)
                if pid == 0:
                    break
                if status != 0 and not should_exit():
                    raise RuntimeError("worker crashed: %s, %s" % (pid, status))
        except EnvironmentError as err:
            if err.errno not in (ECHILD, EINTR):
                raise
    signal(SIGCHLD, handle_sigchld)

    # Initialization complete
    sys.stdout.close()
    try:
        while not should_exit():
            try:
                # Spark tells us to exit by closing stdin.  An empty read
                # means end-of-file; the truthiness test holds whether
                # os.read returns str or bytes.
                if not os.read(0, 512):
                    shutdown()
            except EnvironmentError as err:
                if err.errno != EINTR:
                    shutdown()
                    raise
    finally:
        signal(SIGTERM, SIG_DFL)
        exit_flag.value = True
        # Send SIGHUP to notify workers of shutdown
        os.kill(0, SIGHUP)
|
2013-05-06 19:34:30 -04:00
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: start the manager only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    manager()
|