[SPARK-25657][SQL][TEST] Refactor HashBenchmark to use main method

## What changes were proposed in this pull request?

Refactor `HashBenchmark` to use main method.
1. Run with `spark-submit`:
```console
bin/spark-submit --class org.apache.spark.sql.HashBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar ./sql/catalyst/target/spark-catalyst_2.11-3.0.0-SNAPSHOT-tests.jar
```

2. Generate benchmark result:
```console
SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain org.apache.spark.sql.HashBenchmark"
```

## How was this patch tested?
Manual tests.

Closes #22651 from wangyum/SPARK-25657.

Lead-authored-by: Yuming Wang <wgyumg@gmail.com>
Co-authored-by: Yuming Wang <yumwang@ebay.com>
Co-authored-by: Dongjoon Hyun <dongjoon@apache.org>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
This commit is contained in:
Yuming Wang 2018-10-07 09:49:37 -07:00 committed by Dongjoon Hyun
parent b1328cc58e
commit 669ade3a8e
No known key found for this signature in database
GPG key ID: EDA00CE834F0FC5C
2 changed files with 129 additions and 93 deletions

View file

@ -0,0 +1,70 @@
================================================================================================
single ints
================================================================================================
OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Hash For single ints: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
interpreted version 5615 / 5616 95.6 10.5 1.0X
codegen version 8400 / 8407 63.9 15.6 0.7X
codegen version 64-bit 8139 / 8145 66.0 15.2 0.7X
codegen HiveHash version 7213 / 7348 74.4 13.4 0.8X
================================================================================================
single longs
================================================================================================
OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Hash For single longs: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
interpreted version 6053 / 6054 88.7 11.3 1.0X
codegen version 9367 / 9369 57.3 17.4 0.6X
codegen version 64-bit 8041 / 8051 66.8 15.0 0.8X
codegen HiveHash version 7546 / 7575 71.1 14.1 0.8X
================================================================================================
normal
================================================================================================
OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Hash For normal: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
interpreted version 3181 / 3182 0.7 1517.0 1.0X
codegen version 2403 / 2403 0.9 1145.7 1.3X
codegen version 64-bit 915 / 916 2.3 436.2 3.5X
codegen HiveHash version 4505 / 4527 0.5 2148.3 0.7X
================================================================================================
array
================================================================================================
OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Hash For array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
interpreted version 1828 / 1844 0.1 13946.1 1.0X
codegen version 3678 / 3804 0.0 28058.2 0.5X
codegen version 64-bit 2925 / 2931 0.0 22317.8 0.6X
codegen HiveHash version 1216 / 1217 0.1 9280.0 1.5X
================================================================================================
map
================================================================================================
OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Hash For map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
interpreted version 0 / 0 44.3 22.6 1.0X
codegen version 176 / 176 0.0 42978.8 0.0X
codegen version 64-bit 173 / 175 0.0 42214.3 0.0X
codegen HiveHash version 44 / 44 0.1 10659.9 0.0X

View file

@ -17,7 +17,7 @@
package org.apache.spark.sql
import org.apache.spark.benchmark.Benchmark
import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection
@ -26,94 +26,87 @@ import org.apache.spark.sql.types._
/**
* Benchmark for the previous interpreted hash function (InternalRow.hashCode) vs codegened
* hash expressions (Murmur3Hash/xxHash64).
* To run this benchmark:
* {{{
* 1. without sbt:
* bin/spark-submit --class <this class> --jars <spark core test jar> <spark catalyst test jar>
* 2. with sbt: build/sbt "catalyst/test:runMain <this class>"
* 3. generate result:
* SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/test:runMain <this class>"
* Results will be written to "benchmarks/HashBenchmark-results.txt".
* }}}
*/
object HashBenchmark {
object HashBenchmark extends BenchmarkBase {
def test(name: String, schema: StructType, numRows: Int, iters: Int): Unit = {
val generator = RandomDataGenerator.forType(schema, nullable = false).get
val encoder = RowEncoder(schema)
val attrs = schema.toAttributes
val safeProjection = GenerateSafeProjection.generate(attrs, attrs)
runBenchmark(name) {
val generator = RandomDataGenerator.forType(schema, nullable = false).get
val encoder = RowEncoder(schema)
val attrs = schema.toAttributes
val safeProjection = GenerateSafeProjection.generate(attrs, attrs)
val rows = (1 to numRows).map(_ =>
// The output of encoder is UnsafeRow, use safeProjection to turn it into safe format.
safeProjection(encoder.toRow(generator().asInstanceOf[Row])).copy()
).toArray
val rows = (1 to numRows).map(_ =>
// The output of encoder is UnsafeRow, use safeProjection to turn it into safe format.
safeProjection(encoder.toRow(generator().asInstanceOf[Row])).copy()
).toArray
val benchmark = new Benchmark("Hash For " + name, iters * numRows.toLong)
benchmark.addCase("interpreted version") { _: Int =>
var sum = 0
for (_ <- 0L until iters) {
var i = 0
while (i < numRows) {
sum += rows(i).hashCode()
i += 1
val benchmark = new Benchmark("Hash For " + name, iters * numRows.toLong, output = output)
benchmark.addCase("interpreted version") { _: Int =>
var sum = 0
for (_ <- 0L until iters) {
var i = 0
while (i < numRows) {
sum += rows(i).hashCode()
i += 1
}
}
}
}
val getHashCode = UnsafeProjection.create(new Murmur3Hash(attrs) :: Nil, attrs)
benchmark.addCase("codegen version") { _: Int =>
var sum = 0
for (_ <- 0L until iters) {
var i = 0
while (i < numRows) {
sum += getHashCode(rows(i)).getInt(0)
i += 1
val getHashCode = UnsafeProjection.create(new Murmur3Hash(attrs) :: Nil, attrs)
benchmark.addCase("codegen version") { _: Int =>
var sum = 0
for (_ <- 0L until iters) {
var i = 0
while (i < numRows) {
sum += getHashCode(rows(i)).getInt(0)
i += 1
}
}
}
}
val getHashCode64b = UnsafeProjection.create(new XxHash64(attrs) :: Nil, attrs)
benchmark.addCase("codegen version 64-bit") { _: Int =>
var sum = 0
for (_ <- 0L until iters) {
var i = 0
while (i < numRows) {
sum += getHashCode64b(rows(i)).getInt(0)
i += 1
val getHashCode64b = UnsafeProjection.create(new XxHash64(attrs) :: Nil, attrs)
benchmark.addCase("codegen version 64-bit") { _: Int =>
var sum = 0
for (_ <- 0L until iters) {
var i = 0
while (i < numRows) {
sum += getHashCode64b(rows(i)).getInt(0)
i += 1
}
}
}
}
val getHiveHashCode = UnsafeProjection.create(new HiveHash(attrs) :: Nil, attrs)
benchmark.addCase("codegen HiveHash version") { _: Int =>
var sum = 0
for (_ <- 0L until iters) {
var i = 0
while (i < numRows) {
sum += getHiveHashCode(rows(i)).getInt(0)
i += 1
val getHiveHashCode = UnsafeProjection.create(new HiveHash(attrs) :: Nil, attrs)
benchmark.addCase("codegen HiveHash version") { _: Int =>
var sum = 0
for (_ <- 0L until iters) {
var i = 0
while (i < numRows) {
sum += getHiveHashCode(rows(i)).getInt(0)
i += 1
}
}
}
}
benchmark.run()
benchmark.run()
}
}
def main(args: Array[String]): Unit = {
override def runBenchmarkSuite(): Unit = {
val singleInt = new StructType().add("i", IntegerType)
/*
Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz
Hash For single ints: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
interpreted version 3262 / 3267 164.6 6.1 1.0X
codegen version 6448 / 6718 83.3 12.0 0.5X
codegen version 64-bit 6088 / 6154 88.2 11.3 0.5X
codegen HiveHash version 4732 / 4745 113.5 8.8 0.7X
*/
test("single ints", singleInt, 1 << 15, 1 << 14)
val singleLong = new StructType().add("i", LongType)
/*
Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz
Hash For single longs: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
interpreted version 3716 / 3726 144.5 6.9 1.0X
codegen version 7706 / 7732 69.7 14.4 0.5X
codegen version 64-bit 6370 / 6399 84.3 11.9 0.6X
codegen HiveHash version 4924 / 5026 109.0 9.2 0.8X
*/
test("single longs", singleLong, 1 << 15, 1 << 14)
val normal = new StructType()
@ -131,45 +124,18 @@ object HashBenchmark {
.add("binary", BinaryType)
.add("date", DateType)
.add("timestamp", TimestampType)
/*
Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz
Hash For normal: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
interpreted version 2985 / 3013 0.7 1423.4 1.0X
codegen version 2422 / 2434 0.9 1155.1 1.2X
codegen version 64-bit 856 / 920 2.5 408.0 3.5X
codegen HiveHash version 4501 / 4979 0.5 2146.4 0.7X
*/
test("normal", normal, 1 << 10, 1 << 11)
val arrayOfInt = ArrayType(IntegerType)
val array = new StructType()
.add("array", arrayOfInt)
.add("arrayOfArray", ArrayType(arrayOfInt))
/*
Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz
Hash For array: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
interpreted version 3100 / 3555 0.0 23651.8 1.0X
codegen version 5779 / 5865 0.0 44088.4 0.5X
codegen version 64-bit 4738 / 4821 0.0 36151.7 0.7X
codegen HiveHash version 2200 / 2246 0.1 16785.9 1.4X
*/
test("array", array, 1 << 8, 1 << 9)
val mapOfInt = MapType(IntegerType, IntegerType)
val map = new StructType()
.add("map", mapOfInt)
.add("mapOfMap", MapType(IntegerType, mapOfInt))
/*
Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz
Hash For map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
interpreted version 0 / 0 48.1 20.8 1.0X
codegen version 257 / 275 0.0 62768.7 0.0X
codegen version 64-bit 226 / 240 0.0 55224.5 0.0X
codegen HiveHash version 89 / 96 0.0 21708.8 0.0X
*/
test("map", map, 1 << 6, 1 << 6)
}
}