[SPARK-34986][SQL] Make an error msg clearer when ordinal numbers in group-by refer to agg funcs

### What changes were proposed in this pull request?
Before this change, when a GROUP BY ordinal pointed at a position in the select list that is an aggregate function, the error was reported as:
```
-- !query
select a, b, sum(b) from data group by 3
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
aggregate functions are not allowed in GROUP BY, but found sum(data.b)
```

That message is not clear enough, so this PR refactors it.
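As a minimal sketch of the intended resolution behavior (the names `SelectItem`, `Column`, `AggCall`, and `resolveOrdinal` are hypothetical stand-ins, not Spark's Analyzer API):

```scala
// Hypothetical, simplified model of GROUP BY ordinal resolution; the real
// logic lives in Spark's Analyzer and operates on Catalyst expressions.
sealed trait SelectItem
case class Column(name: String) extends SelectItem
case class AggCall(sql: String) extends SelectItem

def resolveOrdinal(index: Int, items: Seq[SelectItem]): Either[String, SelectItem] =
  if (index < 1 || index > items.size)
    Left(s"GROUP BY position $index is not in select list (valid range is [1, ${items.size}])")
  else items(index - 1) match {
    // The clearer error: say explicitly that the ordinal points at an aggregate.
    case AggCall(sql) =>
      Left(s"GROUP BY $index refers to an expression that is or contains an aggregate " +
        s"function. Aggregate functions are not allowed in GROUP BY, but got $sql")
    case col => Right(col)
  }

// select a, b, sum(b) from data group by 3
val result = resolveOrdinal(3, Seq(Column("a"), Column("b"), AggCall("sum(data.b)")))
```

With ordinal 3 pointing at `sum(data.b)`, the sketch rejects the query with the new message; ordinals pointing at plain columns resolve as before.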

### Why are the changes needed?
To make the error message clearer.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Existing UTs.

Closes #32089 from AngersZhuuuu/SPARK-34986.

Lead-authored-by: Angerszhuuuu <angers.zhu@gmail.com>
Co-authored-by: AngersZhuuuu <angers.zhu@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
Angerszhuuuu 2021-04-12 11:45:08 +09:00 committed by HyukjinKwon
parent fd8081cd27
commit 03431d40eb
3 changed files with 26 additions and 10 deletions


```diff
@@ -1815,10 +1815,18 @@ class Analyzer(override val catalogManager: CatalogManager)
       expr: Expression,
       aggs: Seq[Expression]): Expression = expr match {
     case ordinal @ UnresolvedOrdinal(index) =>
-      if (index > 0 && index <= aggs.size) {
-        aggs(index - 1)
-      } else {
-        throw QueryCompilationErrors.groupByPositionRangeError(index, aggs.size, ordinal)
+      withPosition(ordinal) {
+        if (index > 0 && index <= aggs.size) {
+          val ordinalExpr = aggs(index - 1)
+          if (ordinalExpr.find(_.isInstanceOf[AggregateExpression]).nonEmpty) {
+            throw QueryCompilationErrors.groupByPositionRefersToAggregateFunctionError(
+              index, ordinalExpr)
+          } else {
+            ordinalExpr
+          }
+        } else {
+          throw QueryCompilationErrors.groupByPositionRangeError(index, aggs.size)
+        }
       }
     case gs: BaseGroupingSets =>
       gs.withNewChildren(gs.children.map(resolveGroupByExpressionOrdinal(_, aggs)))
```
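The containment check in the resolution logic (`ordinalExpr.find(_.isInstanceOf[AggregateExpression]).nonEmpty`) matters because the aggregate may be nested, as in `sum(b) + 2`. A toy sketch of that tree search, using hypothetical stand-ins (`Expr`, `Sum`, `Add`) rather than Catalyst's `TreeNode`/`AggregateExpression`:

```scala
// Toy expression tree illustrating the containment check; these types are
// stand-ins for Catalyst classes, not Spark's actual API.
sealed trait Expr {
  def children: Seq[Expr]
  // Pre-order search, analogous in spirit to TreeNode.find:
  // returns the first node satisfying p, if any.
  def find(p: Expr => Boolean): Option[Expr] =
    if (p(this)) Some(this) else children.view.flatMap(_.find(p)).headOption
}
case class Literal(v: Long) extends Expr { val children: Seq[Expr] = Nil }
case class Attr(name: String) extends Expr { val children: Seq[Expr] = Nil }
case class Add(l: Expr, r: Expr) extends Expr { val children: Seq[Expr] = Seq(l, r) }
case class Sum(child: Expr) extends Expr { val children: Seq[Expr] = Seq(child) }

// sum(b) + 2: the aggregate is nested under Add, so a type check on the
// root alone would miss it; the recursive find catches it anywhere.
val expr = Add(Sum(Attr("b")), Literal(2L))
val containsAgg = expr.find(_.isInstanceOf[Sum]).nonEmpty
```

This is why the error message says "refers to an expression that is or contains an aggregate function" rather than only flagging direct aggregate references.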


```diff
@@ -268,9 +268,17 @@ private[spark] object QueryCompilationErrors {
       s"(valid range is [1, $size])", t.origin.line, t.origin.startPosition)
   }

-  def groupByPositionRangeError(index: Int, size: Int, t: TreeNode[_]): Throwable = {
+  def groupByPositionRefersToAggregateFunctionError(
+      index: Int,
+      expr: Expression): Throwable = {
+    new AnalysisException(s"GROUP BY $index refers to an expression that is or contains " +
+      "an aggregate function. Aggregate functions are not allowed in GROUP BY, " +
+      s"but got ${expr.sql}")
+  }
+
+  def groupByPositionRangeError(index: Int, size: Int): Throwable = {
     new AnalysisException(s"GROUP BY position $index is not in select list " +
-      s"(valid range is [1, $size])", t.origin.line, t.origin.startPosition)
+      s"(valid range is [1, $size])")
   }

   def generatorNotExpectedError(name: FunctionIdentifier, classCanonicalName: String): Throwable = {
```


```diff
@@ -122,7 +122,7 @@ select a, b, sum(b) from data group by 3
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-aggregate functions are not allowed in GROUP BY, but found sum(data.b)
+GROUP BY 3 refers to an expression that is or contains an aggregate function. Aggregate functions are not allowed in GROUP BY, but got sum(data.b) AS `sum(b)`; line 1 pos 39


 -- !query
@@ -131,7 +131,7 @@ select a, b, sum(b) + 2 from data group by 3
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-aggregate functions are not allowed in GROUP BY, but found (sum(data.b) + CAST(2 AS BIGINT))
+GROUP BY 3 refers to an expression that is or contains an aggregate function. Aggregate functions are not allowed in GROUP BY, but got (sum(data.b) + CAST(2 AS BIGINT)) AS `(sum(b) + 2)`; line 1 pos 43


 -- !query
@@ -361,7 +361,7 @@ select a, b, count(1) from data group by a, 3
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-aggregate functions are not allowed in GROUP BY, but found count(1)
+GROUP BY 3 refers to an expression that is or contains an aggregate function. Aggregate functions are not allowed in GROUP BY, but got count(1) AS `count(1)`; line 1 pos 44


 -- !query
@@ -379,7 +379,7 @@ select a, b, count(1) from data group by cube(1, 3)
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-grouping expressions sequence is empty, and 'data.a' is not an aggregate function. Wrap '(count(1) AS `count(1)`)' in windowing function(s) or wrap 'data.a' in first() (or first_value) if you don't care which value you get.
+GROUP BY 3 refers to an expression that is or contains an aggregate function. Aggregate functions are not allowed in GROUP BY, but got count(1) AS `count(1)`; line 1 pos 49


 -- !query
```