From b573f23ed18a19e5ca2b51e3a452d2d5f716729d Mon Sep 17 00:00:00 2001
From: Shahin Shakeri
Date: Mon, 16 Dec 2019 10:11:50 -0800
Subject: [PATCH] [SPARK-29574][K8S] Add SPARK_DIST_CLASSPATH to the executor class path

### What changes were proposed in this pull request?
Include `$SPARK_DIST_CLASSPATH` in the class path when launching `CoarseGrainedExecutorBackend` on Kubernetes executors via the provided `entrypoint.sh`.

### Why are the changes needed?
For Hadoop-free builds that rely on user-provided Hadoop, `$SPARK_DIST_CLASSPATH` contains the required Hadoop jars.

### Does this PR introduce any user-facing change?
No.

### How was this patch tested?
Tested with Kubernetes 1.14, Spark 2.4.4, and Hadoop 3.2.1. Adding `$SPARK_DIST_CLASSPATH` to the `-cp` parameter of `entrypoint.sh` lets the executors launch correctly.

Closes #26493 from sshakeri/master.

Authored-by: Shahin Shakeri
Signed-off-by: Marcelo Vanzin
---
 docs/hadoop-provided.md                      | 22 +++++++++++++++++++
 .../src/main/dockerfiles/spark/entrypoint.sh |  8 ++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/docs/hadoop-provided.md b/docs/hadoop-provided.md
index 37cdaa6150..6442e947ab 100644
--- a/docs/hadoop-provided.md
+++ b/docs/hadoop-provided.md
@@ -39,3 +39,25 @@ export SPARK_DIST_CLASSPATH=$(/path/to/hadoop/bin/hadoop classpath)
 export SPARK_DIST_CLASSPATH=$(hadoop --config /path/to/configs classpath)
 
 {% endhighlight %}
+
+# Hadoop-Free Build Setup for Spark on Kubernetes
+To run the Hadoop-free build of Spark on Kubernetes, the executor image must contain the appropriate version of the Hadoop binaries and have the correct `SPARK_DIST_CLASSPATH` value set. See the example below for the relevant changes needed in the executor Dockerfile:
+
+{% highlight bash %}
+### Set environment variables in the executor Dockerfile ###
+
+ENV SPARK_HOME="/opt/spark"
+ENV HADOOP_HOME="/opt/hadoop"
+ENV PATH="$SPARK_HOME/bin:$HADOOP_HOME/bin:$PATH"
+...
+
+# Copy your target Hadoop binaries into the executor's Hadoop home
+
+COPY /opt/hadoop3 $HADOOP_HOME
+...
+
+# Copy and use the Spark-provided entrypoint.sh. It derives SPARK_DIST_CLASSPATH from the hadoop binary in $HADOOP_HOME and starts the executor. If you customize SPARK_DIST_CLASSPATH here instead, entrypoint.sh keeps your value.
+
+ENTRYPOINT [ "/opt/entrypoint.sh" ]
+...
+{% endhighlight %}
diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh
index 4fe8df61ef..6ee3523c8e 100755
--- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh
+++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh
@@ -56,6 +56,12 @@ elif [ "$PYSPARK_MAJOR_PYTHON_VERSION" == "3" ]; then
   export PYSPARK_DRIVER_PYTHON="python3"
 fi
 
+# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not, derive it here so the Hadoop jars are on the executor's class path.
+# An existing SPARK_DIST_CLASSPATH is left untouched so that customizations made elsewhere (e.g. in the Docker image or the pod spec) are preserved.
+if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
+  export SPARK_DIST_CLASSPATH=$(${HADOOP_HOME}/bin/hadoop classpath)
+fi
+
 if ! [ -z ${HADOOP_CONF_DIR+x} ]; then
   SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
 fi
@@ -77,7 +83,7 @@ case "$1" in
       "${SPARK_EXECUTOR_JAVA_OPTS[@]}"
       -Xms$SPARK_EXECUTOR_MEMORY
       -Xmx$SPARK_EXECUTOR_MEMORY
-      -cp "$SPARK_CLASSPATH"
+      -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH"
      org.apache.spark.executor.CoarseGrainedExecutorBackend
      --driver-url $SPARK_DRIVER_URL
      --executor-id $SPARK_EXECUTOR_ID
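
A quick way to confirm the behavior described under "How was this patch tested?" on a live cluster: because `entrypoint.sh` exports `SPARK_DIST_CLASSPATH` before exec-ing the container's init process, the derived value can be read back from a running executor pod. The sketch below is illustrative and not part of the patch; it assumes `kubectl` access to the cluster, and the pod name `myapp-exec-1` is a placeholder.

```bash
# Find the executor pods (Spark labels them with spark-role=executor).
kubectl get pods -l spark-role=executor

# entrypoint.sh exports SPARK_DIST_CLASSPATH before exec-ing tini (PID 1),
# so the derived value appears in the init process's environment.
# "myapp-exec-1" is a placeholder for a real pod name from the listing above.
kubectl exec myapp-exec-1 -- sh -c 'tr "\0" "\n" < /proc/1/environ | grep "^SPARK_DIST_CLASSPATH="'
```

If the variable is absent there, `HADOOP_HOME` was likely not set in the image, so the new guard in `entrypoint.sh` was skipped and only an explicitly configured `SPARK_DIST_CLASSPATH` would apply.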