[SPARK-12898] Consider having dummyCallSite for HiveTableScan
Currently, HiveTableScan runs with getCallSite, which is quite expensive and shows up prominently when scanning a large table with many partitions (e.g. TPC-DS), slowing down the overall runtime of the job. It would be good to use dummyCallSite in HiveTableScan instead.

Author: Rajesh Balamohan <rbalamohan@apache.org>

Closes #10825 from rajeshbalamohan/SPARK-12898.
parent e75e340a40
commit ab4a6bfd11
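For context, the sketch below shows the idea behind Utils.withDummyCallSite, mirroring the Spark 1.6-era implementation; the object name CallSiteUtil and the property keys "callSite.short" / "callSite.long" are assumptions here, not verified API. While the thread-local call-site properties are set, even to an empty string, SparkContext.getCallSite returns them directly, so every RDD constructed inside the block skips the stack walk that makes getCallSite expensive.

import org.apache.spark.SparkContext

object CallSiteUtil {
  // Minimal sketch, not the authoritative implementation: blank out the
  // thread-local call-site properties, run the body, then restore whatever
  // was set before. The "callSite.short" / "callSite.long" keys are assumed
  // from the Spark 1.6-era CallSite constants.
  def withDummyCallSite[T](sc: SparkContext)(body: => T): T = {
    val oldShort = sc.getLocalProperty("callSite.short")
    val oldLong = sc.getLocalProperty("callSite.long")
    try {
      sc.setLocalProperty("callSite.short", "")
      sc.setLocalProperty("callSite.long", "")
      body // every RDD created here records an empty call site
    } finally {
      // Restore the caller's properties (possibly null, which clears them).
      sc.setLocalProperty("callSite.short", oldShort)
      sc.setLocalProperty("callSite.long", oldLong)
    }
  }
}

Wrapping a block that creates many RDDs, as the partitioned-table branch in the diff below does, pays the stack-walk cost zero times instead of once per created RDD.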
sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScan.scala

@@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.hive._
 import org.apache.spark.sql.types.{BooleanType, DataType}
+import org.apache.spark.util.Utils
 
 /**
  * The Hive table scan operator. Column and partition pruning are both handled.

@@ -133,12 +134,18 @@ case class HiveTableScan(
   }
 
   protected override def doExecute(): RDD[InternalRow] = {
+    // Using dummyCallSite, as getCallSite can turn out to be expensive
+    // with multiple partitions.
     val rdd = if (!relation.hiveQlTable.isPartitioned) {
-      hadoopReader.makeRDDForTable(relation.hiveQlTable)
+      Utils.withDummyCallSite(sqlContext.sparkContext) {
+        hadoopReader.makeRDDForTable(relation.hiveQlTable)
+      }
     } else {
-      hadoopReader.makeRDDForPartitionedTable(
-        prunePartitions(relation.getHiveQlPartitions(partitionPruningPred)))
+      Utils.withDummyCallSite(sqlContext.sparkContext) {
+        hadoopReader.makeRDDForPartitionedTable(
+          prunePartitions(relation.getHiveQlPartitions(partitionPruningPred)))
+      }
     }
     rdd.mapPartitionsInternal { iter =>
       val proj = UnsafeProjection.create(schema)
       iter.map(proj)
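A note on the design: the dummy call site wraps only the RDD construction, which is the hot path here, since makeRDDForPartitionedTable builds one Hadoop RDD per pruned Hive partition and unions them, and each of those RDDs would otherwise capture its own call site. The final mapPartitionsInternal adds just a single RDD, and the UnsafeProjection it builds is created once per partition at execution time.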