[SPARK-19314][SS][CATALYST] Do not allow sort before aggregation in Structured Streaming plan
## What changes were proposed in this pull request?

Sort in a streaming plan should be allowed only after an aggregation in Complete mode. Currently it is incorrectly allowed when present anywhere in the plan, which gives unpredictable, potentially incorrect results.

## How was this patch tested?

New test

Author: Tathagata Das <tathagata.das1565@gmail.com>

Closes #16662 from tdas/SPARK-19314.
This commit is contained in:
parent
e20d9b1565
commit
552e5f0884
|
@ -87,7 +87,7 @@ object UnsupportedOperationChecker {
|
|||
* data.
|
||||
*/
|
||||
def containsCompleteData(subplan: LogicalPlan): Boolean = {
|
||||
val aggs = plan.collect { case a@Aggregate(_, _, _) if a.isStreaming => a }
|
||||
val aggs = subplan.collect { case a@Aggregate(_, _, _) if a.isStreaming => a }
|
||||
// Either the subplan has no streaming source, or it has aggregation with Complete mode
|
||||
!subplan.isStreaming || (aggs.nonEmpty && outputMode == InternalOutputModes.Complete)
|
||||
}
|
||||
|
|
|
@ -199,12 +199,17 @@ class UnsupportedOperationsSuite extends SparkFunSuite {
|
|||
_.intersect(_),
|
||||
streamStreamSupported = false)
|
||||
|
||||
// Sort: supported only on batch subplans and on aggregation + complete output mode
|
||||
// Sort: supported only on batch subplans and after aggregation on streaming plan + complete mode
|
||||
testUnaryOperatorInStreamingPlan("sort", Sort(Nil, true, _))
|
||||
assertSupportedInStreamingPlan(
|
||||
"sort - sort over aggregated data in Complete output mode",
|
||||
"sort - sort after aggregation in Complete output mode",
|
||||
streamRelation.groupBy()(Count("*")).sortBy(),
|
||||
Complete)
|
||||
assertNotSupportedInStreamingPlan(
|
||||
"sort - sort before aggregation in Complete output mode",
|
||||
streamRelation.sortBy().groupBy()(Count("*")),
|
||||
Complete,
|
||||
Seq("sort", "aggregat", "complete"))
|
||||
assertNotSupportedInStreamingPlan(
|
||||
"sort - sort over aggregated data in Update output mode",
|
||||
streamRelation.groupBy()(Count("*")).sortBy(),
|
||||
|
|
Loading…
Reference in a new issue