[SPARK-28283][SQL][PYTHON][TESTS] Convert and port 'intersect-all.sql' into UDF test base

## What changes were proposed in this pull request?

This PR adds tests converted from `intersect-all.sql` to exercise UDFs. Please see the contribution guide in the umbrella ticket, [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921).

<details><summary>Diff comparing to 'intersect-all.sql'</summary>
<p>

```diff
diff --git a/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-intersect-all.sql.out
index 63dd56ce46..0cb82be2da 100644
--- a/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-intersect-all.sql.out
@@ -34,11 +34,11 @@ struct<>

 -- !query 2
-SELECT * FROM tab1
+SELECT udf(k), v FROM tab1
 INTERSECT ALL
-SELECT * FROM tab2
+SELECT k, udf(v) FROM tab2
 -- !query 2 schema
-struct<k:int,v:int>
+struct<CAST(udf(cast(k as string)) AS INT):int,v:int>
 -- !query 2 output
 1	2
 1	2
@@ -48,11 +48,11 @@ NULL	NULL

 -- !query 3
-SELECT * FROM tab1
+SELECT k, udf(v) FROM tab1
 INTERSECT ALL
-SELECT * FROM tab1 WHERE k = 1
+SELECT udf(k), v FROM tab1 WHERE udf(k) = 1
 -- !query 3 schema
-struct<k:int,v:int>
+struct<k:int,CAST(udf(cast(v as string)) AS INT):int>
 -- !query 3 output
 1	2
 1	2
@@ -61,39 +61,39 @@ struct<k:int,v:int>

 -- !query 4
-SELECT * FROM tab1 WHERE k > 2
+SELECT udf(k), udf(v) FROM tab1 WHERE k > udf(2)
 INTERSECT ALL
-SELECT * FROM tab2
+SELECT udf(k), udf(v) FROM tab2
 -- !query 4 schema
-struct<k:int,v:int>
+struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int>
 -- !query 4 output

 -- !query 5
-SELECT * FROM tab1
+SELECT udf(k), v FROM tab1
 INTERSECT ALL
-SELECT * FROM tab2 WHERE k > 3
+SELECT udf(k), v FROM tab2 WHERE udf(udf(k)) > 3
 -- !query 5 schema
-struct<k:int,v:int>
+struct<CAST(udf(cast(k as string)) AS INT):int,v:int>
 -- !query 5 output

 -- !query 6
-SELECT * FROM tab1
+SELECT udf(k), v FROM tab1
 INTERSECT ALL
-SELECT CAST(1 AS BIGINT), CAST(2 AS BIGINT)
+SELECT CAST(udf(1) AS BIGINT), CAST(udf(2) AS BIGINT)
 -- !query 6 schema
-struct<k:bigint,v:bigint>
+struct<CAST(udf(cast(k as string)) AS INT):bigint,v:bigint>
 -- !query 6 output
 1	2

 -- !query 7
-SELECT * FROM tab1
+SELECT k, udf(v) FROM tab1
 INTERSECT ALL
-SELECT array(1), 2
+SELECT array(1), udf(2)
 -- !query 7 schema
 struct<>
 -- !query 7 output
@@ -102,9 +102,9 @@ IntersectAll can only be performed on tables with the compatible column types. a

 -- !query 8
-SELECT k FROM tab1
+SELECT udf(k) FROM tab1
 INTERSECT ALL
-SELECT k, v FROM tab2
+SELECT udf(k), udf(v) FROM tab2
 -- !query 8 schema
 struct<>
 -- !query 8 output
@@ -113,13 +113,13 @@ IntersectAll can only be performed on tables with the same number of columns, bu

 -- !query 9
-SELECT * FROM tab2
+SELECT udf(k), v FROM tab2
 INTERSECT ALL
-SELECT * FROM tab1
+SELECT k, udf(v) FROM tab1
 INTERSECT ALL
-SELECT * FROM tab2
+SELECT udf(k), udf(v) FROM tab2
 -- !query 9 schema
-struct<k:int,v:int>
+struct<CAST(udf(cast(k as string)) AS INT):int,v:int>
 -- !query 9 output
 1	2
 1	2
@@ -129,15 +129,15 @@ NULL	NULL

 -- !query 10
-SELECT * FROM tab1
+SELECT udf(k), v FROM tab1
 EXCEPT
-SELECT * FROM tab2
+SELECT k, udf(v) FROM tab2
 UNION ALL
-SELECT * FROM tab1
+SELECT k, udf(udf(v)) FROM tab1
 INTERSECT ALL
-SELECT * FROM tab2
+SELECT udf(k), v FROM tab2
 -- !query 10 schema
-struct<k:int,v:int>
+struct<CAST(udf(cast(k as string)) AS INT):int,v:int>
 -- !query 10 output
 1	2
 1	2
@@ -148,15 +148,15 @@ NULL	NULL

 -- !query 11
-SELECT * FROM tab1
+SELECT udf(k), udf(v) FROM tab1
 EXCEPT
-SELECT * FROM tab2
+SELECT udf(k), v FROM tab2
 EXCEPT
-SELECT * FROM tab1
+SELECT k, udf(v) FROM tab1
 INTERSECT ALL
-SELECT * FROM tab2
+SELECT udf(k), udf(udf(v)) FROM tab2
 -- !query 11 schema
-struct<k:int,v:int>
+struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int>
 -- !query 11 output
 1	3

@@ -165,38 +165,38 @@ struct<k:int,v:int>
 (
   (
     (
-      SELECT * FROM tab1
+      SELECT udf(k), v FROM tab1
       EXCEPT
-      SELECT * FROM tab2
+      SELECT k, udf(v) FROM tab2
     )
     EXCEPT
-    SELECT * FROM tab1
+    SELECT udf(k), udf(v) FROM tab1
   )
   INTERSECT ALL
-  SELECT * FROM tab2
+  SELECT udf(k), udf(v) FROM tab2
 )
 -- !query 12 schema
-struct<k:int,v:int>
+struct<CAST(udf(cast(k as string)) AS INT):int,v:int>
 -- !query 12 output

 -- !query 13
 SELECT *
-FROM   (SELECT tab1.k,
-               tab2.v
+FROM   (SELECT udf(tab1.k),
+               udf(tab2.v)
         FROM   tab1
                JOIN tab2
-                 ON tab1.k = tab2.k)
+                 ON udf(udf(tab1.k)) = tab2.k)
 INTERSECT ALL
 SELECT *
-FROM   (SELECT tab1.k,
-               tab2.v
+FROM   (SELECT udf(tab1.k),
+               udf(tab2.v)
         FROM   tab1
                JOIN tab2
-                 ON tab1.k = tab2.k)
+                 ON udf(tab1.k) = udf(udf(tab2.k)))
 -- !query 13 schema
-struct<k:int,v:int>
+struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int>
 -- !query 13 output
 1	2
 1	2
@@ -211,30 +211,30 @@ struct<k:int,v:int>

 -- !query 14
 SELECT *
-FROM   (SELECT tab1.k,
-               tab2.v
+FROM   (SELECT udf(tab1.k),
+               udf(tab2.v)
         FROM   tab1
                JOIN tab2
-                 ON tab1.k = tab2.k)
+                 ON udf(tab1.k) = udf(tab2.k))
 INTERSECT ALL
 SELECT *
-FROM   (SELECT tab2.v AS k,
-               tab1.k AS v
+FROM   (SELECT udf(tab2.v) AS k,
+               udf(tab1.k) AS v
         FROM   tab1
                JOIN tab2
-                 ON tab1.k = tab2.k)
+                 ON tab1.k = udf(tab2.k))
 -- !query 14 schema
-struct<k:int,v:int>
+struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int>
 -- !query 14 output

 -- !query 15
-SELECT v FROM tab1 GROUP BY v
+SELECT udf(v) FROM tab1 GROUP BY v
 INTERSECT ALL
-SELECT k FROM tab2 GROUP BY k
+SELECT udf(udf(k)) FROM tab2 GROUP BY k
 -- !query 15 schema
-struct<v:int>
+struct<CAST(udf(cast(v as string)) AS INT):int>
 -- !query 15 output
 2
 3
@@ -250,15 +250,15 @@ spark.sql.legacy.setopsPrecedence.enabled	true

 -- !query 17
-SELECT * FROM tab1
+SELECT udf(k), v FROM tab1
 EXCEPT
-SELECT * FROM tab2
+SELECT k, udf(v) FROM tab2
 UNION ALL
-SELECT * FROM tab1
+SELECT udf(k), udf(v) FROM tab1
 INTERSECT ALL
-SELECT * FROM tab2
+SELECT udf(udf(k)), udf(v) FROM tab2
 -- !query 17 schema
-struct<k:int,v:int>
+struct<CAST(udf(cast(k as string)) AS INT):int,v:int>
 -- !query 17 output
 1	2
 1	2
@@ -268,15 +268,15 @@ NULL	NULL

 -- !query 18
-SELECT * FROM tab1
+SELECT k, udf(v) FROM tab1
 EXCEPT
-SELECT * FROM tab2
+SELECT udf(k), v FROM tab2
 UNION ALL
-SELECT * FROM tab1
+SELECT udf(k), udf(v) FROM tab1
 INTERSECT
-SELECT * FROM tab2
+SELECT udf(k), udf(udf(v)) FROM tab2
 -- !query 18 schema
-struct<k:int,v:int>
+struct<k:int,CAST(udf(cast(v as string)) AS INT):int>
 -- !query 18 output
 1	2
 2	3

```
</p>
</details>

## How was this patch tested?

Tested as guided in [SPARK-27921](https://issues.apache.org/jira/browse/SPARK-27921).

Closes #25119 from imback82/intersect-all-sql.

Authored-by: Terry Kim <yuminkim@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
This commit is contained in:
Terry Kim 2019-07-18 19:49:57 +09:00 committed by HyukjinKwon
parent 4645ffb08a
commit 62004f1c0f
2 changed files with 469 additions and 0 deletions

View file

@@ -0,0 +1,162 @@
-- This test file was converted from intersect-all.sql.
-- Every query below wraps one or more columns or literals in udf(...) so that
-- the set-operation cases of the original file are also exercised through UDFs.
-- tab1: left-hand fixture. It contains duplicate rows and (null, null) rows.
CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES
(1, 2),
(1, 2),
(1, 3),
(1, 3),
(2, 3),
(null, null),
(null, null)
AS tab1(k, v);
-- tab2: right-hand fixture. It shares (1, 2), (2, 3) and (null, null) with
-- tab1 and adds (3, 4), which does not occur in tab1.
CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES
(1, 2),
(1, 2),
(2, 3),
(3, 4),
(null, null),
(null, null)
AS tab2(k, v);
-- Basic INTERSECT ALL: duplicate rows and NULL rows common to both sides are kept.
SELECT udf(k), v FROM tab1
INTERSECT ALL
SELECT k, udf(v) FROM tab2;
-- INTERSECT ALL with the same table in both branches (the right branch is filtered to k = 1).
SELECT k, udf(v) FROM tab1
INTERSECT ALL
SELECT udf(k), v FROM tab1 WHERE udf(k) = 1;
-- Empty left relation: no row of tab1 has k > 2.
SELECT udf(k), udf(v) FROM tab1 WHERE k > udf(2)
INTERSECT ALL
SELECT udf(k), udf(v) FROM tab2;
-- Empty right relation: no row of tab2 has k > 3.
SELECT udf(k), v FROM tab1
INTERSECT ALL
SELECT udf(k), v FROM tab2 WHERE udf(udf(k)) > 3;
-- Type-coerced INTERSECT ALL: the right side is BIGINT, so the result columns are widened to bigint.
SELECT udf(k), v FROM tab1
INTERSECT ALL
SELECT CAST(udf(1) AS BIGINT), CAST(udf(2) AS BIGINT);
-- Error: the column types of the two sides are not compatible (array<int> vs int in the first column).
SELECT k, udf(v) FROM tab1
INTERSECT ALL
SELECT array(1), udf(2);
-- Error: the two branches select a different number of columns (1 vs 2).
SELECT udf(k) FROM tab1
INTERSECT ALL
SELECT udf(k), udf(v) FROM tab2;
-- Chain of INTERSECT ALL operations across three branches.
SELECT udf(k), v FROM tab2
INTERSECT ALL
SELECT k, udf(v) FROM tab1
INTERSECT ALL
SELECT udf(k), udf(v) FROM tab2;
-- Chain of different set operations: EXCEPT, UNION ALL, then INTERSECT ALL.
-- This runs before the legacy precedence flag is enabled further down. The
-- expected output corresponds to INTERSECT ALL being applied before EXCEPT
-- and UNION ALL.
SELECT udf(k), v FROM tab1
EXCEPT
SELECT k, udf(v) FROM tab2
UNION ALL
SELECT k, udf(udf(v)) FROM tab1
INTERSECT ALL
SELECT udf(k), v FROM tab2
;
-- Chain of different set operations: two EXCEPTs followed by INTERSECT ALL.
SELECT udf(k), udf(v) FROM tab1
EXCEPT
SELECT udf(k), v FROM tab2
EXCEPT
SELECT k, udf(v) FROM tab1
INTERSECT ALL
SELECT udf(k), udf(udf(v)) FROM tab2
;
-- Use parentheses to control the order of evaluation: the nested EXCEPTs are
-- evaluated first and INTERSECT ALL is applied to their result.
(
(
(
SELECT udf(k), v FROM tab1
EXCEPT
SELECT k, udf(v) FROM tab2
)
EXCEPT
SELECT udf(k), udf(v) FROM tab1
)
INTERSECT ALL
SELECT udf(k), udf(v) FROM tab2
)
;
-- Join under INTERSECT ALL: both branches select (tab1.k, tab2.v) from the
-- same join on k, differing only in where udf(...) is applied in the ON clause.
SELECT *
FROM (SELECT udf(tab1.k),
udf(tab2.v)
FROM tab1
JOIN tab2
ON udf(udf(tab1.k)) = tab2.k)
INTERSECT ALL
SELECT *
FROM (SELECT udf(tab1.k),
udf(tab2.v)
FROM tab1
JOIN tab2
ON udf(tab1.k) = udf(udf(tab2.k)));
-- Join under INTERSECT ALL (2): the second branch swaps the two columns
-- (tab2.v AS k, tab1.k AS v), and the expected result is empty.
SELECT *
FROM (SELECT udf(tab1.k),
udf(tab2.v)
FROM tab1
JOIN tab2
ON udf(tab1.k) = udf(tab2.k))
INTERSECT ALL
SELECT *
FROM (SELECT udf(tab2.v) AS k,
udf(tab1.k) AS v
FROM tab1
JOIN tab2
ON tab1.k = udf(tab2.k));
-- GROUP BY under INTERSECT ALL: distinct v values of tab1 intersected with
-- distinct k values of tab2.
SELECT udf(v) FROM tab1 GROUP BY v
INTERSECT ALL
SELECT udf(udf(k)) FROM tab2 GROUP BY k;
-- Test the pre-Spark 2.4 behavior of set operation precedence:
-- all the set operators are given equal precedence and are evaluated
-- from left to right as they appear in the query.
-- Enable the legacy precedence property.
SET spark.sql.legacy.setopsPrecedence.enabled= true;
-- EXCEPT, UNION ALL, then INTERSECT ALL, evaluated left to right under the legacy flag.
SELECT udf(k), v FROM tab1
EXCEPT
SELECT k, udf(v) FROM tab2
UNION ALL
SELECT udf(k), udf(v) FROM tab1
INTERSECT ALL
SELECT udf(udf(k)), udf(v) FROM tab2;
-- Same chain shape, ending in INTERSECT (without ALL) instead of INTERSECT ALL.
SELECT k, udf(v) FROM tab1
EXCEPT
SELECT udf(k), v FROM tab2
UNION ALL
SELECT udf(k), udf(v) FROM tab1
INTERSECT
SELECT udf(k), udf(udf(v)) FROM tab2;
-- Restore the property so the legacy precedence is switched off again.
SET spark.sql.legacy.setopsPrecedence.enabled = false;
-- Clean-up: drop both fixture views.
DROP VIEW IF EXISTS tab1;
DROP VIEW IF EXISTS tab2;

View file

@@ -0,0 +1,307 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 22
-- !query 0
CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES
(1, 2),
(1, 2),
(1, 3),
(1, 3),
(2, 3),
(null, null),
(null, null)
AS tab1(k, v)
-- !query 0 schema
struct<>
-- !query 0 output
-- !query 1
CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES
(1, 2),
(1, 2),
(2, 3),
(3, 4),
(null, null),
(null, null)
AS tab2(k, v)
-- !query 1 schema
struct<>
-- !query 1 output
-- !query 2
SELECT udf(k), v FROM tab1
INTERSECT ALL
SELECT k, udf(v) FROM tab2
-- !query 2 schema
struct<CAST(udf(cast(k as string)) AS INT):int,v:int>
-- !query 2 output
1 2
1 2
2 3
NULL NULL
NULL NULL
-- !query 3
SELECT k, udf(v) FROM tab1
INTERSECT ALL
SELECT udf(k), v FROM tab1 WHERE udf(k) = 1
-- !query 3 schema
struct<k:int,CAST(udf(cast(v as string)) AS INT):int>
-- !query 3 output
1 2
1 2
1 3
1 3
-- !query 4
SELECT udf(k), udf(v) FROM tab1 WHERE k > udf(2)
INTERSECT ALL
SELECT udf(k), udf(v) FROM tab2
-- !query 4 schema
struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int>
-- !query 4 output
-- !query 5
SELECT udf(k), v FROM tab1
INTERSECT ALL
SELECT udf(k), v FROM tab2 WHERE udf(udf(k)) > 3
-- !query 5 schema
struct<CAST(udf(cast(k as string)) AS INT):int,v:int>
-- !query 5 output
-- !query 6
SELECT udf(k), v FROM tab1
INTERSECT ALL
SELECT CAST(udf(1) AS BIGINT), CAST(udf(2) AS BIGINT)
-- !query 6 schema
struct<CAST(udf(cast(k as string)) AS INT):bigint,v:bigint>
-- !query 6 output
1 2
-- !query 7
SELECT k, udf(v) FROM tab1
INTERSECT ALL
SELECT array(1), udf(2)
-- !query 7 schema
struct<>
-- !query 7 output
org.apache.spark.sql.AnalysisException
IntersectAll can only be performed on tables with the compatible column types. array<int> <> int at the first column of the second table;
-- !query 8
SELECT udf(k) FROM tab1
INTERSECT ALL
SELECT udf(k), udf(v) FROM tab2
-- !query 8 schema
struct<>
-- !query 8 output
org.apache.spark.sql.AnalysisException
IntersectAll can only be performed on tables with the same number of columns, but the first table has 1 columns and the second table has 2 columns;
-- !query 9
SELECT udf(k), v FROM tab2
INTERSECT ALL
SELECT k, udf(v) FROM tab1
INTERSECT ALL
SELECT udf(k), udf(v) FROM tab2
-- !query 9 schema
struct<CAST(udf(cast(k as string)) AS INT):int,v:int>
-- !query 9 output
1 2
1 2
2 3
NULL NULL
NULL NULL
-- !query 10
SELECT udf(k), v FROM tab1
EXCEPT
SELECT k, udf(v) FROM tab2
UNION ALL
SELECT k, udf(udf(v)) FROM tab1
INTERSECT ALL
SELECT udf(k), v FROM tab2
-- !query 10 schema
struct<CAST(udf(cast(k as string)) AS INT):int,v:int>
-- !query 10 output
1 2
1 2
1 3
2 3
NULL NULL
NULL NULL
-- !query 11
SELECT udf(k), udf(v) FROM tab1
EXCEPT
SELECT udf(k), v FROM tab2
EXCEPT
SELECT k, udf(v) FROM tab1
INTERSECT ALL
SELECT udf(k), udf(udf(v)) FROM tab2
-- !query 11 schema
struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int>
-- !query 11 output
1 3
-- !query 12
(
(
(
SELECT udf(k), v FROM tab1
EXCEPT
SELECT k, udf(v) FROM tab2
)
EXCEPT
SELECT udf(k), udf(v) FROM tab1
)
INTERSECT ALL
SELECT udf(k), udf(v) FROM tab2
)
-- !query 12 schema
struct<CAST(udf(cast(k as string)) AS INT):int,v:int>
-- !query 12 output
-- !query 13
SELECT *
FROM (SELECT udf(tab1.k),
udf(tab2.v)
FROM tab1
JOIN tab2
ON udf(udf(tab1.k)) = tab2.k)
INTERSECT ALL
SELECT *
FROM (SELECT udf(tab1.k),
udf(tab2.v)
FROM tab1
JOIN tab2
ON udf(tab1.k) = udf(udf(tab2.k)))
-- !query 13 schema
struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int>
-- !query 13 output
1 2
1 2
1 2
1 2
1 2
1 2
1 2
1 2
2 3
-- !query 14
SELECT *
FROM (SELECT udf(tab1.k),
udf(tab2.v)
FROM tab1
JOIN tab2
ON udf(tab1.k) = udf(tab2.k))
INTERSECT ALL
SELECT *
FROM (SELECT udf(tab2.v) AS k,
udf(tab1.k) AS v
FROM tab1
JOIN tab2
ON tab1.k = udf(tab2.k))
-- !query 14 schema
struct<CAST(udf(cast(k as string)) AS INT):int,CAST(udf(cast(v as string)) AS INT):int>
-- !query 14 output
-- !query 15
SELECT udf(v) FROM tab1 GROUP BY v
INTERSECT ALL
SELECT udf(udf(k)) FROM tab2 GROUP BY k
-- !query 15 schema
struct<CAST(udf(cast(v as string)) AS INT):int>
-- !query 15 output
2
3
NULL
-- !query 16
SET spark.sql.legacy.setopsPrecedence.enabled= true
-- !query 16 schema
struct<key:string,value:string>
-- !query 16 output
spark.sql.legacy.setopsPrecedence.enabled true
-- !query 17
SELECT udf(k), v FROM tab1
EXCEPT
SELECT k, udf(v) FROM tab2
UNION ALL
SELECT udf(k), udf(v) FROM tab1
INTERSECT ALL
SELECT udf(udf(k)), udf(v) FROM tab2
-- !query 17 schema
struct<CAST(udf(cast(k as string)) AS INT):int,v:int>
-- !query 17 output
1 2
1 2
2 3
NULL NULL
NULL NULL
-- !query 18
SELECT k, udf(v) FROM tab1
EXCEPT
SELECT udf(k), v FROM tab2
UNION ALL
SELECT udf(k), udf(v) FROM tab1
INTERSECT
SELECT udf(k), udf(udf(v)) FROM tab2
-- !query 18 schema
struct<k:int,CAST(udf(cast(v as string)) AS INT):int>
-- !query 18 output
1 2
2 3
NULL NULL
-- !query 19
SET spark.sql.legacy.setopsPrecedence.enabled = false
-- !query 19 schema
struct<key:string,value:string>
-- !query 19 output
spark.sql.legacy.setopsPrecedence.enabled false
-- !query 20
DROP VIEW IF EXISTS tab1
-- !query 20 schema
struct<>
-- !query 20 output
-- !query 21
DROP VIEW IF EXISTS tab2
-- !query 21 schema
struct<>
-- !query 21 output