103 lines
4 KiB
R
103 lines
4 KiB
R
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
# Worker daemon
|
|
|
|
rLibDir <- Sys.getenv("SPARKR_RLIBDIR")
|
|
connectionTimeout <- as.integer(Sys.getenv("SPARKR_BACKEND_CONNECTION_TIMEOUT", "6000"))
|
|
dirs <- strsplit(rLibDir, ",")[[1]]
|
|
script <- file.path(dirs[[1]], "SparkR", "worker", "worker.R")
|
|
|
|
# preload SparkR package, speedup worker
|
|
.libPaths(c(dirs, .libPaths()))
|
|
suppressPackageStartupMessages(library(SparkR))
|
|
|
|
port <- as.integer(Sys.getenv("SPARKR_WORKER_PORT"))
|
|
inputCon <- socketConnection(
|
|
port = port, open = "wb", blocking = TRUE, timeout = connectionTimeout)
|
|
|
|
SparkR:::doServerAuth(inputCon, Sys.getenv("SPARKR_WORKER_SECRET"))
|
|
|
|
# Waits indefinitely for a socket connecion by default.
|
|
selectTimeout <- NULL
|
|
|
|
while (TRUE) {
|
|
ready <- socketSelect(list(inputCon), timeout = selectTimeout)
|
|
|
|
# Note that the children should be terminated in the parent. If each child terminates
|
|
# itself, it appears that the resource is not released properly, that causes an unexpected
|
|
# termination of this daemon due to, for example, running out of file descriptors
|
|
# (see SPARK-21093). Therefore, the current implementation tries to retrieve children
|
|
# that are exited (but not terminated) and then sends a kill signal to terminate them properly
|
|
# in the parent.
|
|
#
|
|
# There are two paths that it attempts to send a signal to terminate the children in the parent.
|
|
#
|
|
# 1. Every second if any socket connection is not available and if there are child workers
|
|
# running.
|
|
# 2. Right after a socket connection is available.
|
|
#
|
|
# In other words, the parent attempts to send the signal to the children every second if
|
|
# any worker is running or right before launching other worker children from the following
|
|
# new socket connection.
|
|
|
|
# The process IDs of exited children are returned below.
|
|
children <- parallel:::selectChildren(timeout = 0)
|
|
|
|
if (is.integer(children)) {
|
|
lapply(children, function(child) {
|
|
# This should be the PIDs of exited children. Otherwise, this returns raw bytes if any data
|
|
# was sent from this child. In this case, we discard it.
|
|
pid <- parallel:::readChild(child)
|
|
if (is.integer(pid)) {
|
|
# This checks if the data from this child is the same pid of this selected child.
|
|
if (child == pid) {
|
|
# If so, we terminate this child.
|
|
tools::pskill(child, tools::SIGUSR1)
|
|
}
|
|
}
|
|
})
|
|
} else if (is.null(children)) {
|
|
# If it is NULL, there are no children. Waits indefinitely for a socket connecion.
|
|
selectTimeout <- NULL
|
|
}
|
|
|
|
if (ready) {
|
|
port <- SparkR:::readInt(inputCon)
|
|
# There is a small chance that it could be interrupted by signal, retry one time
|
|
if (length(port) == 0) {
|
|
port <- SparkR:::readInt(inputCon)
|
|
if (length(port) == 0) {
|
|
cat("quitting daemon\n")
|
|
quit(save = "no")
|
|
}
|
|
}
|
|
p <- parallel:::mcfork()
|
|
if (inherits(p, "masterProcess")) {
|
|
# Reach here because this is a child process.
|
|
close(inputCon)
|
|
Sys.setenv(SPARKR_WORKER_PORT = port)
|
|
try(source(script))
|
|
# Note that this mcexit does not fully terminate this child.
|
|
parallel:::mcexit(0L)
|
|
} else {
|
|
# Forking succeeded and we need to check if they finished their jobs every second.
|
|
selectTimeout <- 1
|
|
}
|
|
}
|
|
}
|