[SPARK-25589][SQL][TEST] Add BloomFilterBenchmark
## What changes were proposed in this pull request? This PR aims to add `BloomFilterBenchmark`. Apache Spark has supported bloom filters for the ORC data source for a long time. For the Parquet data source, support is expected to arrive with the next Parquet release update. ## How was this patch tested? Manual. ```scala SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.benchmark.BloomFilterBenchmark" ``` Closes #22605 from dongjoon-hyun/SPARK-25589. Authored-by: Dongjoon Hyun <dongjoon@apache.org> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
This commit is contained in:
parent
928d0739c4
commit
1a5d83bed8
24
sql/core/benchmarks/BloomFilterBenchmark-results.txt
Normal file
24
sql/core/benchmarks/BloomFilterBenchmark-results.txt
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
================================================================================================
|
||||||
|
ORC Write
|
||||||
|
================================================================================================
|
||||||
|
|
||||||
|
OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
|
||||||
|
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||||
|
Write 100M rows: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
|
||||||
|
------------------------------------------------------------------------------------------------
|
||||||
|
Without bloom filter 16765 / 17587 6.0 167.7 1.0X
|
||||||
|
With bloom filter 20060 / 20626 5.0 200.6 0.8X
|
||||||
|
|
||||||
|
|
||||||
|
================================================================================================
|
||||||
|
ORC Read
|
||||||
|
================================================================================================
|
||||||
|
|
||||||
|
OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
|
||||||
|
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||||
|
Read a row from 100M rows: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
|
||||||
|
------------------------------------------------------------------------------------------------
|
||||||
|
Without bloom filter 1857 / 1904 53.9 18.6 1.0X
|
||||||
|
With bloom filter 1399 / 1437 71.5 14.0 1.3X
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,87 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.spark.sql.execution.benchmark
|
||||||
|
|
||||||
|
import scala.util.Random
|
||||||
|
|
||||||
|
import org.apache.spark.benchmark.Benchmark
|
||||||
|
|
||||||
|
/**
 * Benchmark to measure ORC read/write performance with and without Bloom filters.
 *
 * Currently, only ORC supports bloom filters; a Parquet benchmark will be added as soon as
 * Parquet support becomes available.
 *
 * To run this benchmark:
 * {{{
 *   1. without sbt: bin/spark-submit --class <this class> <spark sql test jar>
 *   2. build/sbt "sql/test:runMain <this class>"
 *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
 *      Results will be written to "benchmarks/BloomFilterBenchmark-results.txt".
 * }}}
 */
object BloomFilterBenchmark extends SqlBasedBenchmark {
  import spark.implicits._

  // Scale factor expressed in millions of rows; N is the total row count (100M).
  private val scaleFactor = 100
  private val N = scaleFactor * 1000 * 1000

  // Random integer values give the bloom filter a realistic, high-cardinality column.
  // NOTE(review): parentheses added on `nextInt()` — it is side-effecting (mutates RNG state),
  // so by Scala convention it keeps its parameter list.
  private val df = spark.range(N).map(_ => Random.nextInt())

  /** Measures ORC write time with the bloom filter disabled vs. enabled. */
  private def writeBenchmark(): Unit = {
    withTempPath { dir =>
      val path = dir.getCanonicalPath

      // Plain string literals here: no interpolation is needed for fixed titles.
      runBenchmark("ORC Write") {
        val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output)
        benchmark.addCase("Without bloom filter") { _ =>
          df.write.mode("overwrite").orc(path + "/withoutBF")
        }
        benchmark.addCase("With bloom filter") { _ =>
          df.write.mode("overwrite")
            .option("orc.bloom.filter.columns", "value").orc(path + "/withBF")
        }
        benchmark.run()
      }
    }
  }

  /**
   * Measures ORC read time for a highly selective predicate (`value = 0`), where a bloom
   * filter should let the reader skip most row groups.
   */
  private def readBenchmark(): Unit = {
    withTempPath { dir =>
      val path = dir.getCanonicalPath

      // Write the two datasets once, up front, so only reads are timed below.
      df.write.orc(path + "/withoutBF")
      df.write.option("orc.bloom.filter.columns", "value").orc(path + "/withBF")

      runBenchmark("ORC Read") {
        val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output)
        benchmark.addCase("Without bloom filter") { _ =>
          // `count()` with parentheses: it is a Spark action that triggers a job.
          spark.read.orc(path + "/withoutBF").where("value = 0").count()
        }
        benchmark.addCase("With bloom filter") { _ =>
          spark.read.orc(path + "/withBF").where("value = 0").count()
        }
        benchmark.run()
      }
    }
  }

  /** Entry point invoked by the benchmark harness: run writes first, then reads. */
  override def runBenchmarkSuite(): Unit = {
    writeBenchmark()
    readBenchmark()
  }
}
|
Loading…
Reference in a new issue