[SPARK-21368][SQL] TPCDSQueryBenchmark can't refer query files.

## What changes were proposed in this pull request?

When TPCDSQueryBenchmark is packaged into a jar, it doesn't work with spark-submit.
This is because the query files bundled inside the jar cannot be referenced as regular files on the file system.

## How was this patch tested?

Ran the benchmark.

Author: sarutak <sarutak@oss.nttdata.co.jp>
Author: Kousuke Saruta <sarutak@oss.nttdata.co.jp>

Closes #18592 from sarutak/fix-tpcds-benchmark.
This commit is contained in:
sarutak 2017-09-12 10:49:46 -07:00 committed by gatorsmile
parent 720c94fe77
commit b9b54b1c88
2 changed files with 74 additions and 14 deletions

View file

@ -17,8 +17,6 @@
package org.apache.spark.sql.execution.benchmark
import java.io.File
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.TableIdentifier
@ -31,7 +29,7 @@ import org.apache.spark.util.Benchmark
/**
* Benchmark to measure TPCDS query performance.
* To run this:
* spark-submit --class <this class> --jars <spark sql test jar>
* spark-submit --class <this class> <spark sql test jar> <TPCDS data location>
*/
object TPCDSQueryBenchmark {
val conf =
@ -61,12 +59,10 @@ object TPCDSQueryBenchmark {
}
def tpcdsAll(dataLocation: String, queries: Seq[String]): Unit = {
require(dataLocation.nonEmpty,
"please modify the value of dataLocation to point to your local TPCDS data")
val tableSizes = setupTables(dataLocation)
queries.foreach { name =>
val queryString = fileToString(new File(Thread.currentThread().getContextClassLoader
.getResource(s"tpcds/$name.sql").getFile))
val queryString = resourceToString(s"tpcds/$name.sql",
classLoader = Thread.currentThread().getContextClassLoader)
// This is an indirect hack to estimate the size of each query's input by traversing the
// logical plan and adding up the sizes of all tables that appear in the plan. Note that this
@ -99,6 +95,7 @@ object TPCDSQueryBenchmark {
}
def main(args: Array[String]): Unit = {
val benchmarkArgs = new TPCDSQueryBenchmarkArguments(args)
// List of all TPC-DS queries
val tpcdsQueries = Seq(
@ -113,12 +110,6 @@ object TPCDSQueryBenchmark {
"q81", "q82", "q83", "q84", "q85", "q86", "q87", "q88", "q89", "q90",
"q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99")
// In order to run this benchmark, please follow the instructions at
// https://github.com/databricks/spark-sql-perf/blob/master/README.md to generate the TPCDS data
// locally (preferably with a scale factor of 5 for benchmarking). Thereafter, the value of
// dataLocation below needs to be set to the location where the generated data is stored.
val dataLocation = ""
tpcdsAll(dataLocation, queries = tpcdsQueries)
tpcdsAll(benchmarkArgs.dataLocation, queries = tpcdsQueries)
}
}

View file

@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.execution.benchmark
/**
 * Holds the command-line options accepted by [[TPCDSQueryBenchmark]].
 *
 * Parsing and validation both happen at construction time; an unknown
 * option or a missing data location prints a usage message and exits
 * the JVM, so a successfully constructed instance is always valid.
 *
 * @param args raw command-line arguments, e.g. Array("--data-location", "/path/to/tpcds")
 */
class TPCDSQueryBenchmarkArguments(val args: Array[String]) {
  // Path to the generated TPCDS data; remains null until --data-location is parsed.
  var dataLocation: String = null

  parseArgs(args.toList)
  validateArguments()

  /** Consumes the argument list, recording recognized options as it goes. */
  private def parseArgs(inputArgs: List[String]): Unit = {
    var remaining = inputArgs
    while (remaining.nonEmpty) {
      remaining match {
        case "--data-location" :: value :: rest =>
          dataLocation = value
          remaining = rest
        case _ =>
          // Anything we don't recognize (including a flag missing its value)
          // is fatal: report it and bail out with the usage text.
          // scalastyle:off println
          System.err.println("Unknown/unsupported param " + remaining)
          // scalastyle:on println
          printUsageAndExit(1)
      }
    }
  }

  /** Prints usage help to stderr and terminates the JVM with the given code. */
  private def printUsageAndExit(exitCode: Int): Unit = {
    // scalastyle:off
    System.err.println("""
      |Usage: spark-submit --class <this class> <spark sql test jar> [Options]
      |Options:
      |  --data-location      Path to TPCDS data
      |
      |------------------------------------------------------------------------------------------------------------------
      |In order to run this benchmark, please follow the instructions at
      |https://github.com/databricks/spark-sql-perf/blob/master/README.md
      |to generate the TPCDS data locally (preferably with a scale factor of 5 for benchmarking).
      |Thereafter, the value of <TPCDS data location> needs to be set to the location where the generated data is stored.
      """.stripMargin)
    // scalastyle:on
    System.exit(exitCode)
  }

  /** Ensures the mandatory --data-location option was supplied. */
  private def validateArguments(): Unit = {
    if (dataLocation == null) {
      // scalastyle:off println
      System.err.println("Must specify a data location")
      // scalastyle:on println
      printUsageAndExit(-1)
    }
  }
}