SPARK-2310. Support arbitrary Spark properties on the command line with spark-submit

The PR allows invocations like
  spark-submit --class org.MyClass --conf spark.shuffle.spill=false myjar.jar

Author: Sandy Ryza <sandy@cloudera.com>

Closes #1253 from sryza/sandy-spark-2310 and squashes the following commits:

1dc9855 [Sandy Ryza] More doc and cleanup
00edfb9 [Sandy Ryza] Review comments
91b244a [Sandy Ryza] Change format to --conf PROP=VALUE
8fabe77 [Sandy Ryza] SPARK-2310. Support arbitrary Spark properties on the command line with spark-submit
Authored by Sandy Ryza on 2014-07-23 23:09:25 -07:00; committed by Patrick Wendell
parent 78d18fdbaa
commit e34922a221
5 changed files with 32 additions and 4 deletions


@@ -269,6 +269,9 @@ object SparkSubmit {
sysProps.getOrElseUpdate(k, v)
}
// Spark properties included on command line take precedence
sysProps ++= args.sparkProperties
(childArgs, childClasspath, sysProps, childMainClass)
}
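
Precedence comes from the order of the merges visible in this hunk: defaults from the properties file are folded in with `getOrElseUpdate`, which never overwrites an existing key, while the `--conf` values are applied last with `++=` and therefore win. A minimal standalone sketch of that behaviour, using made-up keys and values rather than code from this commit:

import scala.collection.mutable.HashMap

object ConfPrecedenceSketch {
  def main(args: Array[String]): Unit = {
    // Pretend this key was already set by another command-line flag such as --name.
    val sysProps = HashMap("spark.app.name" -> "beauty")

    // Defaults from the properties file only fill in missing keys.
    val defaults = Seq("spark.app.name" -> "default-name", "spark.shuffle.spill" -> "true")
    for ((k, v) <- defaults) sysProps.getOrElseUpdate(k, v)

    // Spark properties included on the command line take precedence.
    val sparkProperties = HashMap("spark.shuffle.spill" -> "false")
    sysProps ++= sparkProperties

    println(sysProps("spark.app.name"))      // beauty (the default did not overwrite it)
    println(sysProps("spark.shuffle.spill")) // false  (the command-line value wins)
  }
}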


@@ -55,6 +55,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
var verbose: Boolean = false
var isPython: Boolean = false
var pyFiles: String = null
val sparkProperties: HashMap[String, String] = new HashMap[String, String]()
parseOpts(args.toList)
loadDefaults()
@@ -177,6 +178,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
| executorCores $executorCores
| totalExecutorCores $totalExecutorCores
| propertiesFile $propertiesFile
| extraSparkProperties $sparkProperties
| driverMemory $driverMemory
| driverCores $driverCores
| driverExtraClassPath $driverExtraClassPath
@@ -290,6 +292,13 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
jars = Utils.resolveURIs(value)
parse(tail)
case ("--conf" | "-c") :: value :: tail =>
value.split("=", 2).toSeq match {
case Seq(k, v) => sparkProperties(k) = v
case _ => SparkSubmit.printErrorAndExit(s"Spark config without '=': $value")
}
parse(tail)
case ("--help" | "-h") :: tail =>
printUsageAndExit(0)
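
The `split("=", 2)` limit means only the first `=` separates the key from the value, so a value that itself contains `=` (for example a list of Java options) survives intact, and a `--conf` argument with no `=` at all falls through to the error case. A small standalone sketch of that parsing behaviour; the `parseConf` helper and the keys are made up for illustration, and the real code calls `SparkSubmit.printErrorAndExit` instead of printing:

import scala.collection.mutable.HashMap

object ConfParseSketch {
  def main(args: Array[String]): Unit = {
    val sparkProperties = new HashMap[String, String]()

    def parseConf(value: String): Unit = value.split("=", 2).toSeq match {
      case Seq(k, v) => sparkProperties(k) = v
      case _ => println(s"Spark config without '=': $value") // the real code exits here
    }

    parseConf("spark.shuffle.spill=false")
    parseConf("spark.executor.extraJavaOptions=-Dkey=value") // the value keeps its own '='
    parseConf("spark.broken")                                // no '=': rejected

    println(sparkProperties("spark.executor.extraJavaOptions")) // prints -Dkey=value
  }
}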
@@ -349,6 +358,8 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
| on the PYTHONPATH for Python apps.
| --files FILES Comma-separated list of files to be placed in the working
| directory of each executor.
|
| --conf PROP=VALUE Arbitrary Spark configuration property.
| --properties-file FILE Path to a file from which to load extra properties. If not
| specified, this will look for conf/spark-defaults.conf.
|


@@ -120,6 +120,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
"--archives", "archive1.txt,archive2.txt",
"--num-executors", "6",
"--name", "beauty",
"--conf", "spark.shuffle.spill=false",
"thejar.jar",
"arg1", "arg2")
val appArgs = new SparkSubmitArguments(clArgs)
@@ -139,6 +140,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
mainClass should be ("org.apache.spark.deploy.yarn.Client")
classpath should have length (0)
sysProps("spark.app.name") should be ("beauty")
sysProps("spark.shuffle.spill") should be ("false")
sysProps("SPARK_SUBMIT") should be ("true")
}
@@ -156,6 +158,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
"--archives", "archive1.txt,archive2.txt",
"--num-executors", "6",
"--name", "trill",
"--conf", "spark.shuffle.spill=false",
"thejar.jar",
"arg1", "arg2")
val appArgs = new SparkSubmitArguments(clArgs)
@@ -176,6 +179,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
sysProps("spark.yarn.dist.archives") should include regex (".*archive1.txt,.*archive2.txt")
sysProps("spark.jars") should include regex (".*one.jar,.*two.jar,.*three.jar,.*thejar.jar")
sysProps("SPARK_SUBMIT") should be ("true")
sysProps("spark.shuffle.spill") should be ("false")
}
test("handles standalone cluster mode") {
@@ -186,6 +190,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
"--supervise",
"--driver-memory", "4g",
"--driver-cores", "5",
"--conf", "spark.shuffle.spill=false",
"thejar.jar",
"arg1", "arg2")
val appArgs = new SparkSubmitArguments(clArgs)
@@ -195,9 +200,10 @@ class SparkSubmitSuite extends FunSuite with Matchers {
childArgsStr should include regex ("launch spark://h:p .*thejar.jar org.SomeClass arg1 arg2")
mainClass should be ("org.apache.spark.deploy.Client")
classpath should have size (0)
sysProps should have size (2)
sysProps should have size (3)
sysProps.keys should contain ("spark.jars")
sysProps.keys should contain ("SPARK_SUBMIT")
sysProps("spark.shuffle.spill") should be ("false")
}
test("handles standalone client mode") {
@@ -208,6 +214,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
"--total-executor-cores", "5",
"--class", "org.SomeClass",
"--driver-memory", "4g",
"--conf", "spark.shuffle.spill=false",
"thejar.jar",
"arg1", "arg2")
val appArgs = new SparkSubmitArguments(clArgs)
@@ -218,6 +225,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
classpath(0) should endWith ("thejar.jar")
sysProps("spark.executor.memory") should be ("5g")
sysProps("spark.cores.max") should be ("5")
sysProps("spark.shuffle.spill") should be ("false")
}
test("handles mesos client mode") {
@@ -228,6 +236,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
"--total-executor-cores", "5",
"--class", "org.SomeClass",
"--driver-memory", "4g",
"--conf", "spark.shuffle.spill=false",
"thejar.jar",
"arg1", "arg2")
val appArgs = new SparkSubmitArguments(clArgs)
@@ -238,6 +247,7 @@ class SparkSubmitSuite extends FunSuite with Matchers {
classpath(0) should endWith ("thejar.jar")
sysProps("spark.executor.memory") should be ("5g")
sysProps("spark.cores.max") should be ("5")
sysProps("spark.shuffle.spill") should be ("false")
}
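
Because `sparkProperties` is a HashMap keyed by property name, repeating `--conf` for the same key overwrites the earlier value, so the last flag wins. A hypothetical extra test in the style of the suite above could pin that down; it is not part of this commit and assumes the same FunSuite/Matchers context and classpath:

  test("last --conf wins when the same key is given twice") {
    val clArgs = Seq(
      "--master", "local",
      "--class", "org.SomeClass",
      "--conf", "spark.shuffle.spill=true",
      "--conf", "spark.shuffle.spill=false",
      "thejar.jar")
    val appArgs = new SparkSubmitArguments(clArgs)
    appArgs.sparkProperties("spark.shuffle.spill") should be ("false")
  }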
test("launch simple application with spark-submit") {


@@ -42,13 +42,15 @@ val sc = new SparkContext(new SparkConf())
Then, you can supply configuration values at runtime:
{% highlight bash %}
./bin/spark-submit --name "My fancy app" --master local[4] myApp.jar
./bin/spark-submit --name "My app" --master local[4] --conf spark.shuffle.spill=false \
  --conf "spark.executor.extraJavaOptions=-XX:+PrintGCDetails -XX:+PrintGCTimeStamps" myApp.jar
{% endhighlight %}
The Spark shell and [`spark-submit`](cluster-overview.html#launching-applications-with-spark-submit)
tool support two ways to load configurations dynamically. The first are command line options,
such as `--master`, as shown above. Running `./bin/spark-submit --help` will show the entire list
of options.
such as `--master`, as shown above. `spark-submit` can accept any Spark property using the `--conf`
flag, but uses special flags for properties that play a part in launching the Spark application.
Running `./bin/spark-submit --help` will show the entire list of these options.
`bin/spark-submit` will also read configuration options from `conf/spark-defaults.conf`, in which
each line consists of a key and a value separated by whitespace. For example:


@@ -33,6 +33,7 @@ dependencies, and can support different cluster managers and deploy modes that S
--class <main-class> \
--master <master-url> \
--deploy-mode <deploy-mode> \
--conf <key>=<value> \
... # other options
<application-jar> \
[application-arguments]
@@ -43,6 +44,7 @@ Some of the commonly used options are:
* `--class`: The entry point for your application (e.g. `org.apache.spark.examples.SparkPi`)
* `--master`: The [master URL](#master-urls) for the cluster (e.g. `spark://23.195.26.187:7077`)
* `--deploy-mode`: Whether to deploy your driver on the worker nodes (`cluster`) or locally as an external client (`client`) (default: `client`)*
* `--conf`: Arbitrary Spark configuration property in key=value format. For values that contain spaces wrap "key=value" in quotes (as shown).
* `application-jar`: Path to a bundled jar including your application and all dependencies. The URL must be globally visible inside of your cluster, for instance, an `hdfs://` path or a `file://` path that is present on all nodes.
* `application-arguments`: Arguments passed to the main method of your main class, if any