From 6ffec5e6a63555509fb66cfe4b6f9d19bcd1e27c Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Tue, 22 Oct 2019 10:47:04 +0900 Subject: [PATCH] [SPARK-29533][SQL][TEST] Benchmark casting strings to intervals ### What changes were proposed in this pull request? Added a new benchmark `IntervalBenchmark` to measure the performance of interval-related functions. In the PR, I added benchmarks for casting strings to interval. In particular, the benchmarks cover interval strings with and without the `interval` prefix, because there is special code for handling it: https://github.com/apache/spark/blob/da576a737c2db01e5ba5ce19ed0e8f900cb5efaf/common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java#L100-L103 . I also added benchmarks for different numbers of units in interval strings; for example, 1 unit is `interval 10 years`, 2 units w/o interval is `10 years 5 months`, and so on. ### Why are the changes needed? - To find out current performance issues in casting to intervals - The benchmark can be used while refactoring/re-implementing `CalendarInterval.fromString()` or `CalendarInterval.fromCaseInsensitiveString()`. ### Does this PR introduce any user-facing change? No ### How was this patch tested? By running the benchmark via the command: ```shell SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.IntervalBenchmark" ``` Closes #26189 from MaxGekk/interval-from-string-benchmark. 
Authored-by: Maxim Gekk Signed-off-by: HyukjinKwon --- .../IntervalBenchmark-jdk11-results.txt | 25 +++++ .../benchmarks/IntervalBenchmark-results.txt | 25 +++++ .../benchmark/IntervalBenchmark.scala | 102 ++++++++++++++++++ 3 files changed, 152 insertions(+) create mode 100644 sql/core/benchmarks/IntervalBenchmark-jdk11-results.txt create mode 100644 sql/core/benchmarks/IntervalBenchmark-results.txt create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/IntervalBenchmark.scala diff --git a/sql/core/benchmarks/IntervalBenchmark-jdk11-results.txt b/sql/core/benchmarks/IntervalBenchmark-jdk11-results.txt new file mode 100644 index 0000000000..2a3903200a --- /dev/null +++ b/sql/core/benchmarks/IntervalBenchmark-jdk11-results.txt @@ -0,0 +1,25 @@ +OpenJDK 64-Bit Server VM 11.0.2+9 on Mac OS X 10.15 +Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +cast strings to intervals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +string w/ interval 471 513 57 2.1 470.7 1.0X +string w/o interval 437 444 8 2.3 436.9 1.1X +1 units w/ interval 726 758 45 1.4 726.3 0.6X +1 units w/o interval 712 717 5 1.4 711.7 0.7X +2 units w/ interval 926 935 12 1.1 925.9 0.5X +2 units w/o interval 943 947 3 1.1 943.4 0.5X +3 units w/ interval 1089 1116 31 0.9 1089.0 0.4X +3 units w/o interval 1105 1108 3 0.9 1105.1 0.4X +4 units w/ interval 1260 1261 1 0.8 1260.4 0.4X +4 units w/o interval 1276 1277 1 0.8 1275.9 0.4X +5 units w/ interval 1436 1445 11 0.7 1435.6 0.3X +5 units w/o interval 1455 1463 6 0.7 1455.5 0.3X +6 units w/ interval 1634 1639 4 0.6 1634.4 0.3X +6 units w/o interval 1642 1644 3 0.6 1641.7 0.3X +7 units w/ interval 1829 1838 8 0.5 1828.6 0.3X +7 units w/o interval 1850 1853 4 0.5 1849.5 0.3X +8 units w/ interval 2065 2070 5 0.5 2065.4 0.2X +8 units w/o interval 2070 2090 21 0.5 2070.0 0.2X +9 units w/ interval 2279 
2290 10 0.4 2278.7 0.2X +9 units w/o interval 2276 2285 8 0.4 2275.7 0.2X + diff --git a/sql/core/benchmarks/IntervalBenchmark-results.txt b/sql/core/benchmarks/IntervalBenchmark-results.txt new file mode 100644 index 0000000000..9010b980c0 --- /dev/null +++ b/sql/core/benchmarks/IntervalBenchmark-results.txt @@ -0,0 +1,25 @@ +Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.15 +Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +cast strings to intervals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +string w/ interval 420 435 18 2.4 419.8 1.0X +string w/o interval 359 365 10 2.8 358.7 1.2X +1 units w/ interval 752 759 8 1.3 752.0 0.6X +1 units w/o interval 762 766 4 1.3 762.0 0.6X +2 units w/ interval 961 970 8 1.0 960.7 0.4X +2 units w/o interval 970 976 9 1.0 970.2 0.4X +3 units w/ interval 1130 1136 7 0.9 1130.4 0.4X +3 units w/o interval 1150 1158 9 0.9 1150.3 0.4X +4 units w/ interval 1333 1336 3 0.7 1333.5 0.3X +4 units w/o interval 1354 1359 4 0.7 1354.5 0.3X +5 units w/ interval 1523 1525 2 0.7 1523.3 0.3X +5 units w/o interval 1549 1551 3 0.6 1549.4 0.3X +6 units w/ interval 1661 1663 2 0.6 1660.8 0.3X +6 units w/o interval 1691 1704 13 0.6 1691.2 0.2X +7 units w/ interval 1811 1817 8 0.6 1810.6 0.2X +7 units w/o interval 1853 1854 1 0.5 1853.2 0.2X +8 units w/ interval 2029 2037 8 0.5 2028.7 0.2X +8 units w/o interval 2075 2075 1 0.5 2074.5 0.2X +9 units w/ interval 2170 2175 5 0.5 2170.0 0.2X +9 units w/o interval 2204 2212 8 0.5 2203.6 0.2X + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/IntervalBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/IntervalBenchmark.scala new file mode 100644 index 0000000000..4c1c75b815 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/IntervalBenchmark.scala @@ -0,0 +1,102 
@@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import scala.collection.mutable.ListBuffer + +import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.Column +import org.apache.spark.sql.SaveMode.Overwrite +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.internal.SQLConf + +/** + * Synthetic benchmark for interval functions. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class <this class> --jars <spark core test jar> <spark sql test jar> + * 2. build/sbt "sql/test:runMain <this class>" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>" + * Results will be written to "benchmarks/IntervalBenchmark-results.txt". 
+ * }}}
+ */
+object IntervalBenchmark extends SqlBasedBenchmark {
+  import spark.implicits._
+
+  /**
+   * Selects `exprs` over `spark.range(0, cardinality, 1, 1)` and saves the result using the
+   * "noop" format in overwrite mode. Whole-stage codegen is enabled for the duration of the
+   * query.
+   *
+   * @param cardinality exclusive upper bound of the generated `id` range
+   * @param exprs columns to evaluate for every generated row
+   */
+  private def doBenchmark(cardinality: Long, exprs: Column*): Unit = {
+    withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true") {
+      spark
+        .range(0, cardinality, 1, 1)
+        .select(exprs: _*)
+        .write
+        .format("noop")
+        .mode(Overwrite)
+        .save()
+    }
+  }
+
+  /**
+   * Registers a benchmark case that evaluates `exprs` via [[doBenchmark]].
+   *
+   * @param benchmark the benchmark the case is added to
+   * @param cardinality number of input rows passed to [[doBenchmark]]
+   * @param name label of the case in the benchmark output
+   * @param exprs columns to evaluate in the case
+   */
+  private def addCase(
+      benchmark: Benchmark,
+      cardinality: Long,
+      name: String,
+      exprs: Column*): Unit = {
+    benchmark.addCase(name, numIters = 3) { _ =>
+      doBenchmark(cardinality, exprs: _*)
+    }
+  }
+
+  /**
+   * Builds a string column by joining, with a single space: a prefix literal, `id % 10000`
+   * cast to string, the literal "years", and then every element of `units`.
+   *
+   * @param withPrefix if true the prefix literal is "interval"; otherwise an empty string
+   *                   literal takes its place
+   * @param units additional literal parts (for example "13 months") appended in order
+   */
+  private def buildString(withPrefix: Boolean, units: Seq[String] = Seq.empty): Column = {
+    val init = lit(if (withPrefix) "interval" else "") ::
+      ($"id" % 10000).cast("string") ::
+      lit("years") :: Nil
+
+    concat_ws(" ", (init ++ units.map(lit)): _*)
+  }
+
+  /**
+   * Registers two cases that cast the string built from `units` to an interval: one with the
+   * "interval" prefix and one without. The number in the case name is `units.length + 1`
+   * because the years part that [[buildString]] always emits is counted as a unit.
+   */
+  private def addCase(benchmark: Benchmark, cardinality: Long, units: Seq[String]): Unit = {
+    Seq(true, false).foreach { withPrefix =>
+      val expr = buildString(withPrefix, units).cast("interval")
+      val note = if (withPrefix) "w/ interval" else "w/o interval"
+      // Delegate to the named overload so the iteration count is defined in one place.
+      addCase(benchmark, cardinality, s"${units.length + 1} units $note", expr)
+    }
+  }
+
+  /**
+   * Registers all cases of the "cast strings to intervals" benchmark and runs it.
+   *
+   * @param mainArgs command line arguments; not used by this suite
+   */
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    val N = 1000000
+    val timeUnits = Seq(
+      "13 months", "100 weeks", "9 days", "12 hours",
+      "5 minutes", "45 seconds", "123 milliseconds", "567 microseconds")
+
+    val benchmark = new Benchmark("cast strings to intervals", N, output = output)
+    // The first two cases only build the full strings; no cast to interval is applied.
+    addCase(benchmark, N, "string w/ interval", buildString(withPrefix = true, units = timeUnits))
+    addCase(benchmark, N, "string w/o interval", buildString(withPrefix = false, units = timeUnits))
+
+    // n = 0 is the years-only case; each following n adds one more element of `timeUnits`.
+    // Every case receives its own immutable prefix instead of a shared, growing buffer.
+    (0 to timeUnits.length).foreach { n =>
+      addCase(benchmark, N, timeUnits.take(n))
+    }
+
+    benchmark.run()
+  }
+}