[SPARK-32822][SQL] Change the number of partitions to zero when a range is empty with WholeStageCodegen disabled or falled back
### What changes were proposed in this pull request? This PR changes the behavior of RangeExec with WholeStageCodegen disabled or falled back to change the number of partitions to zero when a range is empty. In the current master, if WholeStageCodegen effects, the number of partitions of an empty range will be changed to zero. ``` spark.range(1, 1, 1, 1000).rdd.getNumPartitions res0: Int = 0 ``` But it doesn't if WholeStageCodegen is disabled or falled back. ``` spark.conf.set("spark.sql.codegen.wholeStage", false) spark.range(1, 1, 1, 1000).rdd.getNumPartitions res2: Int = 1000 ``` ### Why are the changes needed? To archive better performance even though WholeStageCodegen disabled or falled back. ### Does this PR introduce _any_ user-facing change? Yes. the number of partitions gotten with `getNumPartitions` for an empty range will be changed when WholeStageCodegen is disabled. ### How was this patch tested? New test. Closes #29681 from sarutak/zero-size-range. Authored-by: Kousuke Saruta <sarutak@oss.nttdata.com> Signed-off-by: Takeshi Yamamuro <yamamuro@apache.org>
This commit is contained in:
parent
a22871f50a
commit
5f468cc21e
|
@ -371,6 +371,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
|
|||
val step: Long = range.step
|
||||
val numSlices: Int = range.numSlices.getOrElse(sparkContext.defaultParallelism)
|
||||
val numElements: BigInt = range.numElements
|
||||
val isEmptyRange: Boolean = start == end || (start < end ^ 0 < step)
|
||||
|
||||
override val output: Seq[Attribute] = range.output
|
||||
|
||||
|
@ -396,7 +397,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
|
|||
}
|
||||
|
||||
override def inputRDDs(): Seq[RDD[InternalRow]] = {
|
||||
val rdd = if (start == end || (start < end ^ 0 < step)) {
|
||||
val rdd = if (isEmptyRange) {
|
||||
new EmptyRDD[InternalRow](sqlContext.sparkContext)
|
||||
} else {
|
||||
sqlContext.sparkContext.parallelize(0 until numSlices, numSlices).map(i => InternalRow(i))
|
||||
|
@ -562,12 +563,16 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
|
|||
|
||||
protected override def doExecute(): RDD[InternalRow] = {
|
||||
val numOutputRows = longMetric("numOutputRows")
|
||||
if (isEmptyRange) {
|
||||
new EmptyRDD[InternalRow](sqlContext.sparkContext)
|
||||
} else {
|
||||
sqlContext
|
||||
.sparkContext
|
||||
.parallelize(0 until numSlices, numSlices)
|
||||
.mapPartitionsWithIndex { (i, _) =>
|
||||
val partitionStart = (i * numElements) / numSlices * step + start
|
||||
val partitionEnd = (((i + 1) * numElements) / numSlices) * step + start
|
||||
|
||||
def getSafeMargin(bi: BigInt): Long =
|
||||
if (bi.isValidLong) {
|
||||
bi.toLong
|
||||
|
@ -576,6 +581,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
|
|||
} else {
|
||||
Long.MinValue
|
||||
}
|
||||
|
||||
val safePartitionStart = getSafeMargin(partitionStart)
|
||||
val safePartitionEnd = getSafeMargin(partitionEnd)
|
||||
val rowSize = UnsafeRow.calculateBitSetWidthInBytes(1) + LongType.defaultSize
|
||||
|
@ -615,6 +621,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
|
|||
new InterruptibleIterator(taskContext, iter)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
override def simpleString(maxFields: Int): String = {
|
||||
s"Range ($start, $end, step=$step, splits=$numSlices)"
|
||||
|
|
|
@ -994,6 +994,13 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
testWithWholeStageCodegenOnAndOff("Change the number of partitions to zero " +
|
||||
"when a range is empty") { _ =>
|
||||
val range = spark.range(1, 1, 1, 1000)
|
||||
val numPartitions = range.rdd.getNumPartitions
|
||||
assert(numPartitions == 0)
|
||||
}
|
||||
}
|
||||
|
||||
// Used for unit-testing EnsureRequirements
|
||||
|
|
Loading…
Reference in a new issue