[SPARK-21368][SQL] TPCDSQueryBenchmark can't refer to query files.
## What changes were proposed in this pull request? TPCDSQueryBenchmark packaged into a jar doesn't work with spark-submit. This is because the query files inside the jar cannot be referenced. ## How was this patch tested? Ran the benchmark. Author: sarutak <sarutak@oss.nttdata.co.jp> Author: Kousuke Saruta <sarutak@oss.nttdata.co.jp> Closes #18592 from sarutak/fix-tpcds-benchmark.
This commit is contained in:
parent
720c94fe77
commit
b9b54b1c88
|
@ -17,8 +17,6 @@
|
||||||
|
|
||||||
package org.apache.spark.sql.execution.benchmark
|
package org.apache.spark.sql.execution.benchmark
|
||||||
|
|
||||||
import java.io.File
|
|
||||||
|
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.SparkSession
|
import org.apache.spark.sql.SparkSession
|
||||||
import org.apache.spark.sql.catalyst.TableIdentifier
|
import org.apache.spark.sql.catalyst.TableIdentifier
|
||||||
|
@ -31,7 +29,7 @@ import org.apache.spark.util.Benchmark
|
||||||
/**
|
/**
|
||||||
* Benchmark to measure TPCDS query performance.
|
* Benchmark to measure TPCDS query performance.
|
||||||
* To run this:
|
* To run this:
|
||||||
* spark-submit --class <this class> --jars <spark sql test jar>
|
* spark-submit --class <this class> <spark sql test jar> <TPCDS data location>
|
||||||
*/
|
*/
|
||||||
object TPCDSQueryBenchmark {
|
object TPCDSQueryBenchmark {
|
||||||
val conf =
|
val conf =
|
||||||
|
@ -61,12 +59,10 @@ object TPCDSQueryBenchmark {
|
||||||
}
|
}
|
||||||
|
|
||||||
def tpcdsAll(dataLocation: String, queries: Seq[String]): Unit = {
|
def tpcdsAll(dataLocation: String, queries: Seq[String]): Unit = {
|
||||||
require(dataLocation.nonEmpty,
|
|
||||||
"please modify the value of dataLocation to point to your local TPCDS data")
|
|
||||||
val tableSizes = setupTables(dataLocation)
|
val tableSizes = setupTables(dataLocation)
|
||||||
queries.foreach { name =>
|
queries.foreach { name =>
|
||||||
val queryString = fileToString(new File(Thread.currentThread().getContextClassLoader
|
val queryString = resourceToString(s"tpcds/$name.sql",
|
||||||
.getResource(s"tpcds/$name.sql").getFile))
|
classLoader = Thread.currentThread().getContextClassLoader)
|
||||||
|
|
||||||
// This is an indirect hack to estimate the size of each query's input by traversing the
|
// This is an indirect hack to estimate the size of each query's input by traversing the
|
||||||
// logical plan and adding up the sizes of all tables that appear in the plan. Note that this
|
// logical plan and adding up the sizes of all tables that appear in the plan. Note that this
|
||||||
|
@ -99,6 +95,7 @@ object TPCDSQueryBenchmark {
|
||||||
}
|
}
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
val benchmarkArgs = new TPCDSQueryBenchmarkArguments(args)
|
||||||
|
|
||||||
// List of all TPC-DS queries
|
// List of all TPC-DS queries
|
||||||
val tpcdsQueries = Seq(
|
val tpcdsQueries = Seq(
|
||||||
|
@ -113,12 +110,6 @@ object TPCDSQueryBenchmark {
|
||||||
"q81", "q82", "q83", "q84", "q85", "q86", "q87", "q88", "q89", "q90",
|
"q81", "q82", "q83", "q84", "q85", "q86", "q87", "q88", "q89", "q90",
|
||||||
"q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99")
|
"q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99")
|
||||||
|
|
||||||
// In order to run this benchmark, please follow the instructions at
|
tpcdsAll(benchmarkArgs.dataLocation, queries = tpcdsQueries)
|
||||||
// https://github.com/databricks/spark-sql-perf/blob/master/README.md to generate the TPCDS data
|
|
||||||
// locally (preferably with a scale factor of 5 for benchmarking). Thereafter, the value of
|
|
||||||
// dataLocation below needs to be set to the location where the generated data is stored.
|
|
||||||
val dataLocation = ""
|
|
||||||
|
|
||||||
tpcdsAll(dataLocation, queries = tpcdsQueries)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,69 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.spark.sql.execution.benchmark
|
||||||
|
|
||||||
|
class TPCDSQueryBenchmarkArguments(val args: Array[String]) {
|
||||||
|
var dataLocation: String = null
|
||||||
|
|
||||||
|
parseArgs(args.toList)
|
||||||
|
validateArguments()
|
||||||
|
|
||||||
|
private def parseArgs(inputArgs: List[String]): Unit = {
|
||||||
|
var args = inputArgs
|
||||||
|
|
||||||
|
while(args.nonEmpty) {
|
||||||
|
args match {
|
||||||
|
case ("--data-location") :: value :: tail =>
|
||||||
|
dataLocation = value
|
||||||
|
args = tail
|
||||||
|
|
||||||
|
case _ =>
|
||||||
|
// scalastyle:off println
|
||||||
|
System.err.println("Unknown/unsupported param " + args)
|
||||||
|
// scalastyle:on println
|
||||||
|
printUsageAndExit(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private def printUsageAndExit(exitCode: Int): Unit = {
|
||||||
|
// scalastyle:off
|
||||||
|
System.err.println("""
|
||||||
|
|Usage: spark-submit --class <this class> <spark sql test jar> [Options]
|
||||||
|
|Options:
|
||||||
|
| --data-location Path to TPCDS data
|
||||||
|
|
|
||||||
|
|------------------------------------------------------------------------------------------------------------------
|
||||||
|
|In order to run this benchmark, please follow the instructions at
|
||||||
|
|https://github.com/databricks/spark-sql-perf/blob/master/README.md
|
||||||
|
|to generate the TPCDS data locally (preferably with a scale factor of 5 for benchmarking).
|
||||||
|
|Thereafter, the value of <TPCDS data location> needs to be set to the location where the generated data is stored.
|
||||||
|
""".stripMargin)
|
||||||
|
// scalastyle:on
|
||||||
|
System.exit(exitCode)
|
||||||
|
}
|
||||||
|
|
||||||
|
private def validateArguments(): Unit = {
|
||||||
|
if (dataLocation == null) {
|
||||||
|
// scalastyle:off println
|
||||||
|
System.err.println("Must specify a data location")
|
||||||
|
// scalastyle:on println
|
||||||
|
printUsageAndExit(-1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue