From b573f23ed18a19e5ca2b51e3a452d2d5f716729d Mon Sep 17 00:00:00 2001
From: Shahin Shakeri
Date: Mon, 16 Dec 2019 10:11:50 -0800
Subject: [PATCH] [SPARK-29574][K8S] Add SPARK_DIST_CLASSPATH to the executor class path

### What changes were proposed in this pull request?
Include `$SPARK_DIST_CLASSPATH` in the class path when launching `CoarseGrainedExecutorBackend` on Kubernetes executors via the provided `entrypoint.sh`.

### Why are the changes needed?
For Hadoop-free builds that rely on user-provided Hadoop, `$SPARK_DIST_CLASSPATH` contains the required Hadoop jars.

### Does this PR introduce any user-facing change?
No.

### How was this patch tested?
Tested with Kubernetes 1.14, Spark 2.4.4, and Hadoop 3.2.1. Adding `$SPARK_DIST_CLASSPATH` to the `-cp` parameter of `entrypoint.sh` lets the executors launch correctly.

Closes #26493 from sshakeri/master.

Authored-by: Shahin Shakeri
Signed-off-by: Marcelo Vanzin
---
 docs/hadoop-provided.md                      | 22 +++++++++++++++++++
 .../src/main/dockerfiles/spark/entrypoint.sh |  8 ++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/docs/hadoop-provided.md b/docs/hadoop-provided.md
index 37cdaa6150..6442e947ab 100644
--- a/docs/hadoop-provided.md
+++ b/docs/hadoop-provided.md
@@ -39,3 +39,25 @@ export SPARK_DIST_CLASSPATH=$(/path/to/hadoop/bin/hadoop classpath)
 export SPARK_DIST_CLASSPATH=$(hadoop --config /path/to/configs classpath)
 
 {% endhighlight %}
+
+# Hadoop-Free Build Setup for Spark on Kubernetes
+To run the Hadoop-free build of Spark on Kubernetes, the executor image must contain the appropriate version of the Hadoop binaries and have the correct `SPARK_DIST_CLASSPATH` value set. See the example below for the relevant changes needed in the executor Dockerfile:
+
+{% highlight bash %}
+### Set environment variables in the executor Dockerfile ###
+
+ENV SPARK_HOME="/opt/spark"
+ENV HADOOP_HOME="/opt/hadoop"
+ENV PATH="$SPARK_HOME/bin:$HADOOP_HOME/bin:$PATH"
+...
+
+# Copy your target Hadoop binaries into the executor's Hadoop home
+
+COPY /opt/hadoop3 $HADOOP_HOME
+...
+
+# Copy and use the Spark-provided entrypoint.sh. It derives SPARK_DIST_CLASSPATH from the hadoop binary in $HADOOP_HOME and starts the executor. If you customize SPARK_DIST_CLASSPATH here instead, entrypoint.sh keeps your value.
+
+ENTRYPOINT [ "/opt/entrypoint.sh" ]
+...
+{% endhighlight %}
diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh
index 4fe8df61ef..6ee3523c8e 100755
--- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh
+++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh
@@ -56,6 +56,12 @@ elif [ "$PYSPARK_MAJOR_PYTHON_VERSION" == "3" ]; then
   export PYSPARK_DRIVER_PYTHON="python3"
 fi
 
+# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not, derive it here so the Hadoop jars are on the executor's class path.
+# An existing SPARK_DIST_CLASSPATH is left untouched so that customizations made elsewhere (e.g. in the Docker image or the pod spec) are preserved.
+if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
+  export SPARK_DIST_CLASSPATH=$(${HADOOP_HOME}/bin/hadoop classpath)
+fi
+
 if ! [ -z ${HADOOP_CONF_DIR+x} ]; then
   SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
 fi
@@ -77,7 +83,7 @@ case "$1" in
       "${SPARK_EXECUTOR_JAVA_OPTS[@]}"
       -Xms$SPARK_EXECUTOR_MEMORY
       -Xmx$SPARK_EXECUTOR_MEMORY
-      -cp "$SPARK_CLASSPATH"
+      -cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH"
      org.apache.spark.executor.CoarseGrainedExecutorBackend
      --driver-url $SPARK_DRIVER_URL
      --executor-id $SPARK_EXECUTOR_ID
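
A quick way to confirm the behavior described under "How was this patch tested?" on a live cluster: because `entrypoint.sh` exports `SPARK_DIST_CLASSPATH` before exec-ing the container's init process, the derived value can be read back from a running executor pod. The sketch below is illustrative and not part of the patch; it assumes `kubectl` access to the cluster, and the pod name `myapp-exec-1` is a placeholder.

```bash
# Find the executor pods (Spark labels them with spark-role=executor).
kubectl get pods -l spark-role=executor

# entrypoint.sh exports SPARK_DIST_CLASSPATH before exec-ing tini (PID 1),
# so the derived value appears in the init process's environment.
# "myapp-exec-1" is a placeholder for a real pod name from the listing above.
kubectl exec myapp-exec-1 -- sh -c 'tr "\0" "\n" < /proc/1/environ | grep "^SPARK_DIST_CLASSPATH="'
```

If the variable is absent there, `HADOOP_HOME` was likely not set in the image, so the new guard in `entrypoint.sh` was skipped and only an explicitly configured `SPARK_DIST_CLASSPATH` would apply.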