diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 9f2f4ef5c6..abad8665a0 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -55,6 +55,8 @@ license: | * and the method `spark.catalog.refreshTable` In Spark 3.1 and earlier, table refreshing leaves dependents uncached. + - In Spark 3.2, the usage of `count(tblName.*)` is blocked to avoid producing ambiguous results, because `count(*)` and `count(tblName.*)` will output differently if there are any null values. To restore the behavior before Spark 3.2, you can set `spark.sql.legacy.allowStarWithSingleTableIdentifierInCount` to `true`. + ## Upgrading from Spark SQL 3.0 to 3.1 - In Spark 3.1, statistical aggregation function includes `std`, `stddev`, `stddev_samp`, `variance`, `var_samp`, `skewness`, `kurtosis`, `covar_samp`, `corr` will return `NULL` instead of `Double.NaN` when `DivideByZero` occurs during expression evaluation, for example, when `stddev_samp` applied on a single element set. In Spark version 3.0 and earlier, it will return `Double.NaN` in such case. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.statisticalAggregate` to `true`. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 9f998aaeeb..5a9f0d58c0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1774,6 +1774,21 @@ class Analyzer(override val catalogManager: CatalogManager) def expandStarExpression(expr: Expression, child: LogicalPlan): Expression = { expr.transformUp { case f1: UnresolvedFunction if containsStar(f1.arguments) => + // SPECIAL CASE: We want to block count(tblName.*) because in Spark, count(tblName.*) will + // be expanded while count(*) will be converted to count(1). 
They will produce different + // results and confuse users if there are any null values. For count(t1.*, t2.*), it is + // still allowed, since it's well-defined in Spark. + if (!conf.allowStarWithSingleTableIdentifierInCount && + f1.name.database.isEmpty && + f1.name.funcName == "count" && + f1.arguments.length == 1) { + f1.arguments.foreach { + case u: UnresolvedStar if u.isQualifiedByTable(child, resolver) => + throw QueryCompilationErrors + .singleTableStarInCountNotAllowedError(u.target.get.mkString(".")) + case _ => // do nothing + } + } f1.copy(arguments = f1.arguments.flatMap { case s: Star => s.expand(child, resolver) case o => o :: Nil diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index 1b83cbfa03..c10b796285 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -336,6 +336,10 @@ case class UnresolvedStar(target: Option[Seq[String]]) extends Star with Unevalu nameParts.corresponds(qualifierList)(resolver) } + def isQualifiedByTable(input: LogicalPlan, resolver: Resolver): Boolean = { + target.exists(nameParts => input.output.exists(matchedQualifier(_, nameParts, resolver))) + } + override def expand( input: LogicalPlan, resolver: Resolver): Seq[NamedExpression] = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 57b4802968..09617bfc2a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -251,6 +251,11 @@ private[spark] object QueryCompilationErrors { new AnalysisException(s"Invalid usage of '*' in $prettyName") 
} + def singleTableStarInCountNotAllowedError(targetString: String): Throwable = { + new AnalysisException(s"count($targetString.*) is not allowed. " + + "Please use count(*) or expand the columns manually, e.g. count(col1, col2)") + } + def orderByPositionRangeError(index: Int, size: Int, t: TreeNode[_]): Throwable = { new AnalysisException(s"ORDER BY position $index is not in select list " + s"(valid range is [1, $size])", t.origin.line, t.origin.startPosition) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 0fa60f114e..ab32fa6889 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1559,6 +1559,14 @@ object SQLConf { .booleanConf .createWithDefault(false) + val ALLOW_STAR_WITH_SINGLE_TABLE_IDENTIFIER_IN_COUNT = + buildConf("spark.sql.legacy.allowStarWithSingleTableIdentifierInCount") + .internal() + .doc("When true, the SQL function 'count' is allowed to take single 'tblName.*' as parameter") + .version("3.2") + .booleanConf + .createWithDefault(false) + val USE_CURRENT_SQL_CONFIGS_FOR_VIEW = buildConf("spark.sql.legacy.useCurrentConfigsForView") .internal() @@ -3577,6 +3585,9 @@ class SQLConf extends Serializable with Logging { def storeAnalyzedPlanForView: Boolean = getConf(SQLConf.STORE_ANALYZED_PLAN_FOR_VIEW) + def allowStarWithSingleTableIdentifierInCount: Boolean = + getConf(SQLConf.ALLOW_STAR_WITH_SINGLE_TABLE_IDENTIFIER_IN_COUNT) + def starSchemaDetection: Boolean = getConf(STARSCHEMA_DETECTION) def starSchemaFTRatio: Double = getConf(STARSCHEMA_FACT_TABLE_RATIO) diff --git a/sql/core/src/test/resources/sql-tests/inputs/count.sql b/sql/core/src/test/resources/sql-tests/inputs/count.sql index b3e37d1ee8..ae27c1cae3 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/count.sql +++ 
b/sql/core/src/test/resources/sql-tests/inputs/count.sql @@ -19,11 +19,11 @@ SELECT FROM testData; -- count with multiple expressions -SELECT count(a, b), count(b, a), count(testData.*) FROM testData; +SELECT count(a, b), count(b, a), count(testData.*, testData.*) FROM testData; -- distinct count with multiple expressions SELECT - count(DISTINCT a, b), count(DISTINCT b, a), count(DISTINCT *), count(DISTINCT testData.*) + count(DISTINCT a, b), count(DISTINCT b, a), count(DISTINCT *), count(DISTINCT testData.*, testData.*) FROM testData; -- distinct count with multiple literals @@ -43,3 +43,11 @@ SELECT count() FROM testData; -- count without expressions set spark.sql.legacy.allowParameterlessCount=false; SELECT count() FROM testData; + +-- legacy behavior: allow count(testData.*) +set spark.sql.legacy.allowStarWithSingleTableIdentifierInCount=true; +SELECT count(testData.*) FROM testData; + +-- count with a single tblName.* as parameter +set spark.sql.legacy.allowStarWithSingleTableIdentifierInCount=false; +SELECT count(testData.*) FROM testData; diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/join.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/join.sql index cc07b00cc3..fd9fe49401 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/join.sql @@ -743,20 +743,25 @@ select * from a left join b on i = x and i = y and x = i; -- -- test NULL behavior of whole-row Vars, per bug #5025 -- -select t1.q2, count(t2.*) +--- [SPARK-34199] changed the `count(t2.*)` to `count(t2.q1, t2.q2)` since we have +--- blocked `count(tblName.*)`. Besides this, in pgsql, `count(t2.*)` of outer join +--- means how many matching rows are produced by t2 while Spark SQL doesn't have this semantic. +--- So here we use `count(t2.q1, t2.q2)` instead of `count(1)` to keep the query output +--- unchanged. 
+select t1.q2, count(t2.q1, t2.q2) from int8_tbl t1 left join int8_tbl t2 on (t1.q2 = t2.q1) group by t1.q2 order by 1; -select t1.q2, count(t2.*) +select t1.q2, count(t2.q1, t2.q2) from int8_tbl t1 left join (select * from int8_tbl) t2 on (t1.q2 = t2.q1) group by t1.q2 order by 1; -- [SPARK-28330] Enhance query limit --- select t1.q2, count(t2.*) +-- select t1.q2, count(t2.q1, t2.q2) -- from int8_tbl t1 left join (select * from int8_tbl offset 0) t2 on (t1.q2 = t2.q1) -- group by t1.q2 order by 1; -select t1.q2, count(t2.*) +select t1.q2, count(t2.q1, t2.q2) from int8_tbl t1 left join (select q1, case when q2=1 then 1 else q2 end as q2 from int8_tbl) t2 on (t1.q2 = t2.q1) diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-join.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-join.sql index 77bcfe7e10..b9ea35038f 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-join.sql @@ -735,20 +735,20 @@ select * from a left join b on udf(i) = x and i = udf(y) and udf(x) = udf(i); -- -- test NULL behavior of whole-row Vars, per bug #5025 -- -select udf(t1.q2), udf(count(t2.*)) +select udf(t1.q2), udf(count(t2.q1, t2.q2)) from int8_tbl t1 left join int8_tbl t2 on (udf(udf(t1.q2)) = t2.q1) group by udf(t1.q2) order by 1; -select udf(udf(t1.q2)), udf(count(t2.*)) +select udf(udf(t1.q2)), udf(count(t2.q1, t2.q2)) from int8_tbl t1 left join (select * from int8_tbl) t2 on (udf(udf(t1.q2)) = udf(t2.q1)) group by udf(udf(t1.q2)) order by 1; -- [SPARK-28330] Enhance query limit --- select t1.q2, count(t2.*) +-- select t1.q2, count(t2.q1, t2.q2) -- from int8_tbl t1 left join (select * from int8_tbl offset 0) t2 on (t1.q2 = t2.q1) -- group by t1.q2 order by 1; -select udf(t1.q2) as q2, udf(udf(count(t2.*))) +select udf(t1.q2) as q2, udf(udf(count(t2.q1, t2.q2))) from int8_tbl t1 left join (select udf(q1) as q1, case when q2=1 then 1 else 
q2 end as q2 from int8_tbl) t2 on (udf(t1.q2) = udf(t2.q1)) diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-count.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-count.sql index 6ab207b93f..82058d43f8 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-count.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-count.sql @@ -20,9 +20,9 @@ SELECT FROM testData; -- count with multiple expressions -SELECT udf(count(a, b)), udf(count(b, a)), udf(count(testData.*)) FROM testData; +SELECT udf(count(a, b)), udf(count(b, a)), udf(count(testData.*, testData.*)) FROM testData; -- distinct count with multiple expressions SELECT - udf(count(DISTINCT a, b)), udf(count(DISTINCT b, a)), udf(count(DISTINCT *)), udf(count(DISTINCT testData.*)) + udf(count(DISTINCT a, b)), udf(count(DISTINCT b, a)), udf(count(DISTINCT *)), udf(count(DISTINCT testData.*, testData.*)) FROM testData; diff --git a/sql/core/src/test/resources/sql-tests/results/count.sql.out b/sql/core/src/test/resources/sql-tests/results/count.sql.out index 7f15bbd373..c0a04a1ddf 100644 --- a/sql/core/src/test/resources/sql-tests/results/count.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/count.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 17 +-- Number of queries: 21 -- !query @@ -38,19 +38,19 @@ struct +struct -- !query output 4 4 4 -- !query SELECT - count(DISTINCT a, b), count(DISTINCT b, a), count(DISTINCT *), count(DISTINCT testData.*) + count(DISTINCT a, b), count(DISTINCT b, a), count(DISTINCT *), count(DISTINCT testData.*, testData.*) FROM testData -- !query schema -struct +struct -- !query output 3 3 3 3 @@ -150,3 +150,36 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException cannot resolve 'count()' due to data type mismatch: count requires at least one argument. 
If you have to call the function count without arguments, set the legacy configuration `spark.sql.legacy.allowParameterlessCount` as true; line 1 pos 7 + + +-- !query +set spark.sql.legacy.allowStarWithSingleTableIdentifierInCount=true +-- !query schema +struct +-- !query output +spark.sql.legacy.allowStarWithSingleTableIdentifierInCount true + + +-- !query +SELECT count(testData.*) FROM testData +-- !query schema +struct +-- !query output +4 + + +-- !query +set spark.sql.legacy.allowStarWithSingleTableIdentifierInCount=false +-- !query schema +struct +-- !query output +spark.sql.legacy.allowStarWithSingleTableIdentifierInCount false + + +-- !query +SELECT count(testData.*) FROM testData +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +count(testData.*) is not allowed. Please use count(*) or expand the columns manually, e.g. count(col1, col2) diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/join.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/join.sql.out index 20f4f6b1f2..ce27295b88 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/join.sql.out @@ -2729,7 +2729,7 @@ struct -- !query -select t1.q2, count(t2.*) +select t1.q2, count(t2.q1, t2.q2) from int8_tbl t1 left join int8_tbl t2 on (t1.q2 = t2.q1) group by t1.q2 order by 1 -- !query schema @@ -2742,7 +2742,7 @@ struct -- !query -select t1.q2, count(t2.*) +select t1.q2, count(t2.q1, t2.q2) from int8_tbl t1 left join (select * from int8_tbl) t2 on (t1.q2 = t2.q1) group by t1.q2 order by 1 -- !query schema @@ -2755,7 +2755,7 @@ struct -- !query -select t1.q2, count(t2.*) +select t1.q2, count(t2.q1, t2.q2) from int8_tbl t1 left join (select q1, case when q2=1 then 1 else q2 end as q2 from int8_tbl) t2 on (t1.q2 = t2.q1) diff --git a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out 
b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out index 3a08cf275b..6fd4a067c7 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out @@ -2757,7 +2757,7 @@ struct -- !query -select udf(t1.q2), udf(count(t2.*)) +select udf(t1.q2), udf(count(t2.q1, t2.q2)) from int8_tbl t1 left join int8_tbl t2 on (udf(udf(t1.q2)) = t2.q1) group by udf(t1.q2) order by 1 -- !query schema @@ -2770,7 +2770,7 @@ struct -- !query -select udf(udf(t1.q2)), udf(count(t2.*)) +select udf(udf(t1.q2)), udf(count(t2.q1, t2.q2)) from int8_tbl t1 left join (select * from int8_tbl) t2 on (udf(udf(t1.q2)) = udf(t2.q1)) group by udf(udf(t1.q2)) order by 1 -- !query schema @@ -2783,7 +2783,7 @@ struct -- !query -select udf(t1.q2) as q2, udf(udf(count(t2.*))) +select udf(t1.q2) as q2, udf(udf(count(t2.q1, t2.q2))) from int8_tbl t1 left join (select udf(q1) as q1, case when q2=1 then 1 else q2 end as q2 from int8_tbl) t2 on (udf(t1.q2) = udf(t2.q1)) diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-count.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-count.sql.out index dd027d93cf..4bd8da6186 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-count.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-count.sql.out @@ -38,18 +38,18 @@ struct +struct -- !query output 4 4 4 -- !query SELECT - udf(count(DISTINCT a, b)), udf(count(DISTINCT b, a)), udf(count(DISTINCT *)), udf(count(DISTINCT testData.*)) + udf(count(DISTINCT a, b)), udf(count(DISTINCT b, a)), udf(count(DISTINCT *)), udf(count(DISTINCT testData.*, testData.*)) FROM testData -- !query schema -struct +struct -- !query output 3 3 3 3 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index 01b1508d03..4f64de4ae8 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -155,6 +155,28 @@ class ColumnExpressionSuite extends QueryTest with SharedSparkSession { checkAnswer(testData.as("testData").select($"testData.*"), testData.collect().toSeq) } + test("SPARK-34199: star can be qualified by table name inside a non-count function") { + checkAnswer( + testData.as("testData").selectExpr("hash(testData.*)"), + testData.as("testData").selectExpr("hash(testData.key, testData.value)") + ) + } + + test("SPARK-34199: star cannot be qualified by table name inside a count function") { + val e = intercept[AnalysisException] { + testData.as("testData").selectExpr("count(testData.*)").collect() + } + assert(e.getMessage.contains( + "count(testData.*) is not allowed. Please use count(*) or expand the columns manually")) + } + + test("SPARK-34199: table star can be qualified inside a count function with multiple arguments") { + checkAnswer( + testData.as("testData").selectExpr("count(testData.*, testData.key)"), + testData.as("testData").selectExpr("count(testData.key, testData.value, testData.key)") + ) + } + test("+") { checkAnswer( testData2.select($"a" + 1),