From e271664a01fd7dee63391890514d76262cad1bc1 Mon Sep 17 00:00:00 2001
From: Wenchen Fan
Date: Mon, 2 Dec 2019 21:05:06 +0800
Subject: [PATCH] [MINOR][SQL] Rename config name to spark.sql.analyzer.failAmbiguousSelfJoin.enabled

### What changes were proposed in this pull request?

add `.enabled` postfix to `spark.sql.analyzer.failAmbiguousSelfJoin`.

### Why are the changes needed?

to follow the existing naming style

### Does this PR introduce any user-facing change?

no

### How was this patch tested?

not needed

Closes #26694 from cloud-fan/conf.

Authored-by: Wenchen Fan
Signed-off-by: Wenchen Fan
---
 docs/sql-migration-guide.md                     |  2 +-
 .../org/apache/spark/sql/internal/SQLConf.scala |  4 ++--
 .../scala/org/apache/spark/sql/Dataset.scala    |  4 ++--
 .../analysis/DetectAmbiguousSelfJoin.scala      |  4 ++--
 .../spark/sql/DataFrameSelfJoinSuite.scala      | 16 ++++++++--------
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index 3ea4839a81..5c2e9ba119 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -111,7 +111,7 @@ license: |
 
   - The result of `java.lang.Math`'s `log`, `log1p`, `exp`, `expm1`, and `pow` may vary across platforms. In Spark 3.0, the result of the equivalent SQL functions (including related SQL functions like `LOG10`) return values consistent with `java.lang.StrictMath`. In virtually all cases this makes no difference in the return value, and the difference is very small, but may not exactly match `java.lang.Math` on x86 platforms in cases like, for example, `log(3.0)`, whose value varies between `Math.log()` and `StrictMath.log()`.
 
-  - Since Spark 3.0, Dataset query fails if it contains ambiguous column reference that is caused by self join. A typical example: `val df1 = ...; val df2 = df1.filter(...);`, then `df1.join(df2, df1("a") > df2("a"))` returns an empty result which is quite confusing. This is because Spark cannot resolve Dataset column references that point to tables being self joined, and `df1("a")` is exactly the same as `df2("a")` in Spark. To restore the behavior before Spark 3.0, you can set `spark.sql.analyzer.failAmbiguousSelfJoin` to `false`.
+  - Since Spark 3.0, Dataset query fails if it contains ambiguous column reference that is caused by self join. A typical example: `val df1 = ...; val df2 = df1.filter(...);`, then `df1.join(df2, df1("a") > df2("a"))` returns an empty result which is quite confusing. This is because Spark cannot resolve Dataset column references that point to tables being self joined, and `df1("a")` is exactly the same as `df2("a")` in Spark. To restore the behavior before Spark 3.0, you can set `spark.sql.analyzer.failAmbiguousSelfJoin.enabled` to `false`.
 
   - Since Spark 3.0, `Cast` function processes string literals such as 'Infinity', '+Infinity', '-Infinity', 'NaN', 'Inf', '+Inf', '-Inf' in case insensitive manner when casting the literals to `Double` or `Float` type to ensure greater compatibility with other database systems. This behaviour change is illustrated in the table below:
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 105b2a857e..e1b8192fba 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -875,8 +875,8 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)
 
-  val FAIL_AMBIGUOUS_SELF_JOIN =
-    buildConf("spark.sql.analyzer.failAmbiguousSelfJoin")
+  val FAIL_AMBIGUOUS_SELF_JOIN_ENABLED =
+    buildConf("spark.sql.analyzer.failAmbiguousSelfJoin.enabled")
       .doc("When true, fail the Dataset query if it contains ambiguous self-join.")
       .internal()
       .booleanConf
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index e1bca44dfc..1b75fccbdb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -229,7 +229,7 @@ class Dataset[T] private[sql](
       case _ =>
         queryExecution.analyzed
     }
-    if (sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN)) {
+    if (sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED)) {
       plan.setTagValue(Dataset.DATASET_ID_TAG, id)
     }
     plan
@@ -1337,7 +1337,7 @@ class Dataset[T] private[sql](
   private def addDataFrameIdToCol(expr: NamedExpression): NamedExpression = {
     val newExpr = expr transform {
       case a: AttributeReference
-          if sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN) =>
+          if sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED) =>
         val metadata = new MetadataBuilder()
           .withMetadata(a.metadata)
           .putLong(Dataset.DATASET_ID_KEY, id)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/analysis/DetectAmbiguousSelfJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/analysis/DetectAmbiguousSelfJoin.scala
index 5c3c735f03..614d6c2846 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/analysis/DetectAmbiguousSelfJoin.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/analysis/DetectAmbiguousSelfJoin.scala
@@ -71,7 +71,7 @@ class DetectAmbiguousSelfJoin(conf: SQLConf) extends Rule[LogicalPlan] {
   }
 
   override def apply(plan: LogicalPlan): LogicalPlan = {
-    if (!conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN)) return plan
+    if (!conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED)) return plan
 
     // We always remove the special metadata from `AttributeReference` at the end of this rule, so
     // Dataset column reference only exists in the root node via Dataset transformations like
@@ -149,7 +149,7 @@ class DetectAmbiguousSelfJoin(conf: SQLConf) extends Rule[LogicalPlan] {
           "to figure out which one. Please alias the Datasets with different names via " +
           "`Dataset.as` before joining them, and specify the column using qualified name, e.g. " +
           """`df.as("a").join(df.as("b"), $"a.id" > $"b.id")`. You can also set """ +
-          s"${SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key} to false to disable this check.")
+          s"${SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key} to false to disable this check.")
       }
     }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala
index 92f1e4306c..59b5dacb10 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala
@@ -96,7 +96,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
     val df2 = df1.filter($"id" > 0)
 
     withSQLConf(
-      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "false",
+      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "false",
       SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
       // `df1("id") > df2("id")` is always false.
       checkAnswer(df1.join(df2, df1("id") > df2("id")), Nil)
@@ -110,7 +110,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
     }
 
     withSQLConf(
-      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true",
+      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true",
       SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
       assertAmbiguousSelfJoin(df1.join(df2, df1("id") > df2("id")))
     }
@@ -121,7 +121,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
     val df2 = df1.filter($"id" > 0)
 
     withSQLConf(
-      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true",
+      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true",
       SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
       assertAmbiguousSelfJoin(df1.join(df2, df1.colRegex("id") > df2.colRegex("id")))
     }
@@ -132,7 +132,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
     val df2 = df1.filter($"a.b" > 0)
 
     withSQLConf(
-      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true",
+      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true",
       SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
       assertAmbiguousSelfJoin(df1.join(df2, df1("a.b") > df2("a.c")))
     }
@@ -143,7 +143,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
     val df2 = df1.filter($"id" > 0)
 
    withSQLConf(
-      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "false",
+      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "false",
       SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
       // `df2("id")` actually points to the column of `df1`.
       checkAnswer(df1.join(df2).select(df2("id")), Seq(0, 0, 1, 1, 2, 2).map(Row(_)))
@@ -157,7 +157,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
     }
 
     withSQLConf(
-      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true",
+      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true",
       SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
       assertAmbiguousSelfJoin(df1.join(df2).select(df2("id")))
     }
@@ -170,7 +170,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
     val df4 = spark.range(1)
 
     withSQLConf(
-      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "false",
+      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "false",
       SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
       // `df2("id") < df3("id")` is always false
       checkAnswer(df1.join(df2).join(df3, df2("id") < df3("id")), Nil)
@@ -196,7 +196,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
     }
 
     withSQLConf(
-      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true",
+      SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true",
       SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
       assertAmbiguousSelfJoin(df1.join(df2).join(df3, df2("id") < df3("id")))
       assertAmbiguousSelfJoin(df1.join(df4).join(df2).select(df2("id")))
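
For reference, a minimal sketch of the behavior behind the renamed flag, mirroring the migration-guide example and the test cases above. It is not part of the patch: the `spark` session name and the interactive `spark.conf.set` calls are illustrative assumptions (the suite itself uses `withSQLConf`), while the two config keys are the ones touched by this diff.

```scala
// Illustrative sketch only -- not part of this patch. Assumes a spark-shell
// style SparkSession named `spark` on a build that includes this rename.
import spark.implicits._

val df1 = spark.range(3).toDF("id")
val df2 = df1.filter($"id" > 0)  // df2 shares df1's lineage: a self-join hazard

// With spark.sql.analyzer.failAmbiguousSelfJoin.enabled=true (the default),
// the next line fails analysis, because df1("id") and df2("id") resolve to
// the exact same attribute:
// df1.join(df2, df1("id") > df2("id"))

// The fix suggested by the error message: alias both sides, then use
// qualified column names.
df1.as("a").join(df2.as("b"), $"a.id" > $"b.id").show()

// Pre-3.0 behavior: with the check disabled, df2("id") silently resolves to
// df1's own column, so the condition compares a column with itself and is
// never true -- the join returns an empty result instead of failing.
spark.conf.set("spark.sql.analyzer.failAmbiguousSelfJoin.enabled", "false")
// The one-sided condition degenerates into a cross join, hence the test
// suite (and this sketch) also enables the cross-join escape hatch.
spark.conf.set("spark.sql.crossJoin.enabled", "true")
df1.join(df2, df1("id") > df2("id")).show()  // prints an empty table
```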