[SPARK-3197] [SQL] Reduce the Expression tree object creations for aggregation function (min/max)
Aggregation function min/max in catalyst will create expression tree for each single row, however, the expression tree creation is quite expensive in a multithreading env currently. Hence we got a very bad performance for the min/max. Here is the benchmark that I've done in my local. Master | Previous Result (ms) | Current Result (ms) ------------ | ------------- | ------------- local | 3645 | 3416 local[6] | 3602 | 1002 The Benchmark source code. ``` case class Record(key: Int, value: Int) object TestHive2 extends HiveContext(new SparkContext("local[6]", "TestSQLContext", new SparkConf())) object DataPrepare extends App { import TestHive2._ val rdd = sparkContext.parallelize((1 to 10000000).map(i => Record(i % 3000, i)), 12) runSqlHive("SHOW TABLES") runSqlHive("DROP TABLE if exists a") runSqlHive("DROP TABLE if exists result") rdd.registerAsTable("records") runSqlHive("""CREATE TABLE a (key INT, value INT) | ROW FORMAT SERDE | 'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe' | STORED AS RCFILE """.stripMargin) runSqlHive("""CREATE TABLE result (key INT, value INT) | ROW FORMAT SERDE | 'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe' | STORED AS RCFILE """.stripMargin) hql(s"""from records | insert into table a | select key, value """.stripMargin) } object PerformanceTest extends App { import TestHive2._ hql("SHOW TABLES") hql("set spark.sql.shuffle.partitions=12") val cmd = "select min(value), max(value) from a group by key" val results = ("Result1", benchmark(cmd)) :: ("Result2", benchmark(cmd)) :: ("Result3", benchmark(cmd)) :: Nil results.foreach { case (prompt, result) => { println(s"$prompt: took ${result._1} ms (${result._2} records)") } } def benchmark(cmd: String) = { val begin = System.currentTimeMillis() val count = hql(cmd).count val end = System.currentTimeMillis() ((end - begin), count) } } ``` Author: Cheng Hao <hao.cheng@intel.com> Closes #2113 from chenghao-intel/aggregation_expression_optimization and squashes the following commits: db40395 [Cheng Hao] remove the transient and add val for the expression property d56167d [Cheng Hao] Reduce the Expressions creation
This commit is contained in:
parent
3b5eb7083d
commit
4238c17dc9
|
@ -105,17 +105,18 @@ case class Min(child: Expression) extends PartialAggregate with trees.UnaryNode[
|
|||
case class MinFunction(expr: Expression, base: AggregateExpression) extends AggregateFunction {
|
||||
def this() = this(null, null) // Required for serialization.
|
||||
|
||||
var currentMin: Any = _
|
||||
val currentMin: MutableLiteral = MutableLiteral(null, expr.dataType)
|
||||
val cmp = GreaterThan(currentMin, expr)
|
||||
|
||||
override def update(input: Row): Unit = {
|
||||
if (currentMin == null) {
|
||||
currentMin = expr.eval(input)
|
||||
} else if(GreaterThan(Literal(currentMin, expr.dataType), expr).eval(input) == true) {
|
||||
currentMin = expr.eval(input)
|
||||
if (currentMin.value == null) {
|
||||
currentMin.value = expr.eval(input)
|
||||
} else if(cmp.eval(input) == true) {
|
||||
currentMin.value = expr.eval(input)
|
||||
}
|
||||
}
|
||||
|
||||
override def eval(input: Row): Any = currentMin
|
||||
override def eval(input: Row): Any = currentMin.value
|
||||
}
|
||||
|
||||
case class Max(child: Expression) extends PartialAggregate with trees.UnaryNode[Expression] {
|
||||
|
@ -135,17 +136,18 @@ case class Max(child: Expression) extends PartialAggregate with trees.UnaryNode[
|
|||
case class MaxFunction(expr: Expression, base: AggregateExpression) extends AggregateFunction {
|
||||
def this() = this(null, null) // Required for serialization.
|
||||
|
||||
var currentMax: Any = _
|
||||
val currentMax: MutableLiteral = MutableLiteral(null, expr.dataType)
|
||||
val cmp = LessThan(currentMax, expr)
|
||||
|
||||
override def update(input: Row): Unit = {
|
||||
if (currentMax == null) {
|
||||
currentMax = expr.eval(input)
|
||||
} else if(LessThan(Literal(currentMax, expr.dataType), expr).eval(input) == true) {
|
||||
currentMax = expr.eval(input)
|
||||
if (currentMax.value == null) {
|
||||
currentMax.value = expr.eval(input)
|
||||
} else if(cmp.eval(input) == true) {
|
||||
currentMax.value = expr.eval(input)
|
||||
}
|
||||
}
|
||||
|
||||
override def eval(input: Row): Any = currentMax
|
||||
override def eval(input: Row): Any = currentMax.value
|
||||
}
|
||||
|
||||
case class Count(child: Expression) extends PartialAggregate with trees.UnaryNode[Expression] {
|
||||
|
@ -350,7 +352,7 @@ case class AverageFunction(expr: Expression, base: AggregateExpression)
|
|||
private val zero = Cast(Literal(0), expr.dataType)
|
||||
|
||||
private var count: Long = _
|
||||
private val sum = MutableLiteral(zero.eval(EmptyRow))
|
||||
private val sum = MutableLiteral(zero.eval(null), expr.dataType)
|
||||
private val sumAsDouble = Cast(sum, DoubleType)
|
||||
|
||||
private def addFunction(value: Any) = Add(sum, Literal(value))
|
||||
|
@ -423,7 +425,7 @@ case class SumFunction(expr: Expression, base: AggregateExpression) extends Aggr
|
|||
|
||||
private val zero = Cast(Literal(0), expr.dataType)
|
||||
|
||||
private val sum = MutableLiteral(zero.eval(null))
|
||||
private val sum = MutableLiteral(zero.eval(null), expr.dataType)
|
||||
|
||||
private val addFunction = Add(sum, Coalesce(Seq(expr, zero)))
|
||||
|
||||
|
|
|
@ -61,11 +61,10 @@ case class Literal(value: Any, dataType: DataType) extends LeafExpression {
|
|||
}
|
||||
|
||||
// TODO: Specialize
|
||||
case class MutableLiteral(var value: Any, nullable: Boolean = true) extends LeafExpression {
|
||||
case class MutableLiteral(var value: Any, dataType: DataType, nullable: Boolean = true)
|
||||
extends LeafExpression {
|
||||
type EvaluatedType = Any
|
||||
|
||||
val dataType = Literal(value).dataType
|
||||
|
||||
def update(expression: Expression, input: Row) = {
|
||||
value = expression.eval(input)
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue