[SPARK-19799][SQL] Support WITH clause in subqueries

## What changes were proposed in this pull request?

This PR  adds support of `WITH` clause within a subquery so this query becomes valid:
  ```
  SELECT max(c) FROM (
    WITH t AS (SELECT 1 AS c)
    SELECT * FROM t
  )
 ```

## How was this patch tested?

Added new UTs.

Closes #24831 from peter-toth/SPARK-19799-2.

Authored-by: Peter Toth <peter.toth@gmail.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
This commit is contained in:
Peter Toth 2019-07-04 07:34:02 -07:00 committed by Dongjoon Hyun
parent cec6a32904
commit cad440d1f5
5 changed files with 345 additions and 67 deletions

View file

@ -278,7 +278,7 @@ locationSpec
;
query
: ctes? queryNoWith
: ctes? queryTerm queryOrganization
;
insertInto
@ -380,10 +380,6 @@ dmlStatementNoWith
| fromClause multiInsertQueryBody+ #multiInsertQuery
;
queryNoWith
: queryTerm queryOrganization
;
queryOrganization
: (ORDER BY order+=sortItem (',' order+=sortItem)*)?
(CLUSTER BY clusterBy+=expression (',' clusterBy+=expression)*)?
@ -412,7 +408,7 @@ queryPrimary
| fromStatement #fromStmt
| TABLE multipartIdentifier #table
| inlineTable #inlineTableDefault1
| '(' queryNoWith ')' #subquery
| '(' query ')' #subquery
;
sortItem
@ -583,7 +579,7 @@ identifierComment
relationPrimary
: multipartIdentifier sample? tableAlias #tableName
| '(' queryNoWith ')' sample? tableAlias #aliasedQuery
| '(' query ')' sample? tableAlias #aliasedQuery
| '(' relation ')' sample? tableAlias #aliasedRelation
| inlineTable #inlineTableDefault2
| functionTable #tableValuedFunction

View file

@ -224,12 +224,13 @@ class Analyzer(
case other => other
}
def substituteCTE(plan: LogicalPlan, cteName: String, ctePlan: LogicalPlan): LogicalPlan = {
private def substituteCTE(
plan: LogicalPlan,
cteName: String,
ctePlan: LogicalPlan): LogicalPlan = {
plan resolveOperatorsUp {
case UnresolvedRelation(Seq(table)) if resolver(cteName, table) =>
ctePlan
case u: UnresolvedRelation =>
u
case other =>
// This cannot be done in ResolveSubquery because ResolveSubquery does not know the CTE.
other transformExpressions {

View file

@ -111,7 +111,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
* Create a top-level plan with Common Table Expressions.
*/
override def visitQuery(ctx: QueryContext): LogicalPlan = withOrigin(ctx) {
val query = plan(ctx.queryNoWith)
val query = plan(ctx.queryTerm).optionalMap(ctx.queryOrganization)(withQueryResultClauses)
// Apply CTEs
query.optionalMap(ctx.ctes)(withCTE)
@ -175,10 +175,6 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
}
}
override def visitQueryNoWith(ctx: QueryNoWithContext): LogicalPlan = withOrigin(ctx) {
plan(ctx.queryTerm).optionalMap(ctx.queryOrganization)(withQueryResultClauses)
}
/**
* Create a named logical plan.
*
@ -890,7 +886,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
* Create a logical plan for a sub-query.
*/
override def visitSubquery(ctx: SubqueryContext): LogicalPlan = withOrigin(ctx) {
plan(ctx.queryNoWith)
plan(ctx.query)
}
/**
@ -978,7 +974,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
* }}}
*/
override def visitAliasedQuery(ctx: AliasedQueryContext): LogicalPlan = withOrigin(ctx) {
val relation = plan(ctx.queryNoWith).optionalMap(ctx.sample)(withSample)
val relation = plan(ctx.query).optionalMap(ctx.sample)(withSample)
if (ctx.tableAlias.strictIdentifier == null) {
// For un-aliased subqueries, use a default alias name that is not likely to conflict with
// normal subquery names, so that parent operators can only access the columns in subquery by

View file

@ -4,6 +4,9 @@ create temporary view t2 as select * from values 0, 1 as t(id);
-- WITH clause should not fall into infinite loop by referencing self
WITH s AS (SELECT 1 FROM s) SELECT * FROM s;
WITH r AS (SELECT (SELECT * FROM r))
SELECT * FROM r;
-- WITH clause should reference the base table
WITH t AS (SELECT 1 FROM t) SELECT * FROM t;
@ -28,6 +31,112 @@ FROM CTE1 t1
WITH t(x) AS (SELECT 1)
SELECT * FROM t WHERE x = 1;
-- CTE in CTE definition
WITH t as (
WITH t2 AS (SELECT 1)
SELECT * FROM t2
)
SELECT * FROM t;
-- CTE in subquery
SELECT max(c) FROM (
WITH t(c) AS (SELECT 1)
SELECT * FROM t
);
-- CTE in subquery expression
SELECT (
WITH t AS (SELECT 1)
SELECT * FROM t
);
-- CTE in CTE definition shadows outer
WITH
t AS (SELECT 1),
t2 AS (
WITH t AS (SELECT 2)
SELECT * FROM t
)
SELECT * FROM t2;
-- CTE in CTE definition shadows outer 2
WITH
t(c) AS (SELECT 1),
t2 AS (
SELECT (
SELECT max(c) FROM (
WITH t(c) AS (SELECT 2)
SELECT * FROM t
)
)
)
SELECT * FROM t2;
-- CTE in CTE definition shadows outer 3
WITH
t AS (SELECT 1),
t2 AS (
WITH t AS (SELECT 2),
t2 AS (
WITH t AS (SELECT 3)
SELECT * FROM t
)
SELECT * FROM t2
)
SELECT * FROM t2;
-- CTE in subquery shadows outer
WITH t(c) AS (SELECT 1)
SELECT max(c) FROM (
WITH t(c) AS (SELECT 2)
SELECT * FROM t
);
-- CTE in subquery shadows outer 2
WITH t(c) AS (SELECT 1)
SELECT sum(c) FROM (
SELECT max(c) AS c FROM (
WITH t(c) AS (SELECT 2)
SELECT * FROM t
)
);
-- CTE in subquery shadows outer 3
WITH t(c) AS (SELECT 1)
SELECT sum(c) FROM (
WITH t(c) AS (SELECT 2)
SELECT max(c) AS c FROM (
WITH t(c) AS (SELECT 3)
SELECT * FROM t
)
);
-- CTE in subquery expression shadows outer
WITH t AS (SELECT 1)
SELECT (
WITH t AS (SELECT 2)
SELECT * FROM t
);
-- CTE in subquery expression shadows outer 2
WITH t AS (SELECT 1)
SELECT (
SELECT (
WITH t AS (SELECT 2)
SELECT * FROM t
)
);
-- CTE in subquery expression shadows outer 3
WITH t AS (SELECT 1)
SELECT (
WITH t AS (SELECT 2)
SELECT (
WITH t AS (SELECT 3)
SELECT * FROM t
)
);
-- Clean up
DROP VIEW IF EXISTS t;
DROP VIEW IF EXISTS t2;

View file

@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 10
-- Number of queries: 23
-- !query 0
@ -28,36 +28,46 @@ Table or view not found: s; line 1 pos 25
-- !query 3
WITH t AS (SELECT 1 FROM t) SELECT * FROM t
WITH r AS (SELECT (SELECT * FROM r))
SELECT * FROM r
-- !query 3 schema
struct<1:int>
struct<>
-- !query 3 output
1
1
1
org.apache.spark.sql.AnalysisException
Table or view not found: r; line 1 pos 33
-- !query 4
WITH s1 AS (SELECT 1 FROM s2), s2 AS (SELECT 1 FROM s1) SELECT * FROM s1, s2
WITH t AS (SELECT 1 FROM t) SELECT * FROM t
-- !query 4 schema
struct<>
struct<1:int>
-- !query 4 output
1
1
1
-- !query 5
WITH s1 AS (SELECT 1 FROM s2), s2 AS (SELECT 1 FROM s1) SELECT * FROM s1, s2
-- !query 5 schema
struct<>
-- !query 5 output
org.apache.spark.sql.AnalysisException
Table or view not found: s2; line 1 pos 26
-- !query 5
WITH t1 AS (SELECT * FROM t2), t2 AS (SELECT 2 FROM t1) SELECT * FROM t1 cross join t2
-- !query 5 schema
struct<id:int,2:int>
-- !query 5 output
0 2
0 2
1 2
1 2
-- !query 6
WITH t1 AS (SELECT * FROM t2), t2 AS (SELECT 2 FROM t1) SELECT * FROM t1 cross join t2
-- !query 6 schema
struct<id:int,2:int>
-- !query 6 output
0 2
0 2
1 2
1 2
-- !query 7
WITH CTE1 AS (
SELECT b.id AS id
FROM T2 a
@ -67,47 +77,213 @@ SELECT t1.id AS c1,
t2.id AS c2
FROM CTE1 t1
CROSS JOIN CTE1 t2
-- !query 6 schema
struct<c1:int,c2:int>
-- !query 6 output
0 0
0 0
0 0
0 0
0 1
0 1
0 1
0 1
1 0
1 0
1 0
1 0
1 1
1 1
1 1
1 1
-- !query 7
WITH t(x) AS (SELECT 1)
SELECT * FROM t WHERE x = 1
-- !query 7 schema
struct<x:int>
struct<c1:int,c2:int>
-- !query 7 output
1
0 0
0 0
0 0
0 0
0 1
0 1
0 1
0 1
1 0
1 0
1 0
1 0
1 1
1 1
1 1
1 1
-- !query 8
DROP VIEW IF EXISTS t
WITH t(x) AS (SELECT 1)
SELECT * FROM t WHERE x = 1
-- !query 8 schema
struct<>
struct<x:int>
-- !query 8 output
1
-- !query 9
DROP VIEW IF EXISTS t2
WITH t as (
WITH t2 AS (SELECT 1)
SELECT * FROM t2
)
SELECT * FROM t
-- !query 9 schema
struct<>
struct<1:int>
-- !query 9 output
1
-- !query 10
SELECT max(c) FROM (
WITH t(c) AS (SELECT 1)
SELECT * FROM t
)
-- !query 10 schema
struct<max(c):int>
-- !query 10 output
1
-- !query 11
SELECT (
WITH t AS (SELECT 1)
SELECT * FROM t
)
-- !query 11 schema
struct<scalarsubquery():int>
-- !query 11 output
1
-- !query 12
WITH
t AS (SELECT 1),
t2 AS (
WITH t AS (SELECT 2)
SELECT * FROM t
)
SELECT * FROM t2
-- !query 12 schema
struct<1:int>
-- !query 12 output
1
-- !query 13
WITH
t(c) AS (SELECT 1),
t2 AS (
SELECT (
SELECT max(c) FROM (
WITH t(c) AS (SELECT 2)
SELECT * FROM t
)
)
)
SELECT * FROM t2
-- !query 13 schema
struct<scalarsubquery():int>
-- !query 13 output
1
-- !query 14
WITH
t AS (SELECT 1),
t2 AS (
WITH t AS (SELECT 2),
t2 AS (
WITH t AS (SELECT 3)
SELECT * FROM t
)
SELECT * FROM t2
)
SELECT * FROM t2
-- !query 14 schema
struct<2:int>
-- !query 14 output
2
-- !query 15
WITH t(c) AS (SELECT 1)
SELECT max(c) FROM (
WITH t(c) AS (SELECT 2)
SELECT * FROM t
)
-- !query 15 schema
struct<max(c):int>
-- !query 15 output
2
-- !query 16
WITH t(c) AS (SELECT 1)
SELECT sum(c) FROM (
SELECT max(c) AS c FROM (
WITH t(c) AS (SELECT 2)
SELECT * FROM t
)
)
-- !query 16 schema
struct<sum(c):bigint>
-- !query 16 output
2
-- !query 17
WITH t(c) AS (SELECT 1)
SELECT sum(c) FROM (
WITH t(c) AS (SELECT 2)
SELECT max(c) AS c FROM (
WITH t(c) AS (SELECT 3)
SELECT * FROM t
)
)
-- !query 17 schema
struct<sum(c):bigint>
-- !query 17 output
3
-- !query 18
WITH t AS (SELECT 1)
SELECT (
WITH t AS (SELECT 2)
SELECT * FROM t
)
-- !query 18 schema
struct<scalarsubquery():int>
-- !query 18 output
1
-- !query 19
WITH t AS (SELECT 1)
SELECT (
SELECT (
WITH t AS (SELECT 2)
SELECT * FROM t
)
)
-- !query 19 schema
struct<scalarsubquery():int>
-- !query 19 output
1
-- !query 20
WITH t AS (SELECT 1)
SELECT (
WITH t AS (SELECT 2)
SELECT (
WITH t AS (SELECT 3)
SELECT * FROM t
)
)
-- !query 20 schema
struct<scalarsubquery():int>
-- !query 20 output
1
-- !query 21
DROP VIEW IF EXISTS t
-- !query 21 schema
struct<>
-- !query 21 output
-- !query 22
DROP VIEW IF EXISTS t2
-- !query 22 schema
struct<>
-- !query 22 output