[SPARK-28392][SQL][TESTS] Add traits for UDF and PostgreSQL tests to share initialization

## What changes were proposed in this pull request?

This PR adds some traits so that we can deduplicate initialization stuff for each type of test case. For instance, see [SPARK-28343](https://issues.apache.org/jira/browse/SPARK-28343).

It's a little bit overkill but I think it will make adding test cases easier and cause less confusion.

This PR adds both:

```
  private trait PgSQLTest
  private trait UDFTest
```

To indicate and share the logic related to each combination of test types.

## How was this patch tested?

Manually tested.

Closes #25155 from HyukjinKwon/SPARK-28392.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
This commit is contained in:
HyukjinKwon 2019-07-15 16:20:09 +09:00
parent 72cc853092
commit a7a02a86ad
3 changed files with 141 additions and 134 deletions

View file

@ -6,14 +6,10 @@
-- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/case.sql
-- Test the CASE statement
--
-- This test suite contains two Cartesian products without using explicit CROSS JOIN syntax.
-- Thus, we set spark.sql.crossJoin.enabled to true.
-- This test file was converted from pgSQL/case.sql.
-- Note that currently registered UDF returns a string. So there are some differences, for instance
-- in string cast within UDF in Scala and Python.
set spark.sql.crossJoin.enabled=true;
CREATE TABLE CASE_TBL (
i integer,
f double
@ -269,4 +265,3 @@ SELECT CASE
DROP TABLE CASE_TBL;
DROP TABLE CASE2_TBL;
set spark.sql.crossJoin.enabled=false;

View file

@ -1,20 +1,23 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 37
-- Number of queries: 35
-- !query 0
set spark.sql.crossJoin.enabled=true
-- !query 0 schema
struct<key:string,value:string>
-- !query 0 output
spark.sql.crossJoin.enabled true
-- !query 1
CREATE TABLE CASE_TBL (
i integer,
f double
) USING parquet
-- !query 0 schema
struct<>
-- !query 0 output
-- !query 1
CREATE TABLE CASE2_TBL (
i integer,
j integer
) USING parquet
-- !query 1 schema
struct<>
-- !query 1 output
@ -22,10 +25,7 @@ struct<>
-- !query 2
CREATE TABLE CASE2_TBL (
i integer,
j integer
) USING parquet
INSERT INTO CASE_TBL VALUES (1, 10.1)
-- !query 2 schema
struct<>
-- !query 2 output
@ -33,7 +33,7 @@ struct<>
-- !query 3
INSERT INTO CASE_TBL VALUES (1, 10.1)
INSERT INTO CASE_TBL VALUES (2, 20.2)
-- !query 3 schema
struct<>
-- !query 3 output
@ -41,7 +41,7 @@ struct<>
-- !query 4
INSERT INTO CASE_TBL VALUES (2, 20.2)
INSERT INTO CASE_TBL VALUES (3, -30.3)
-- !query 4 schema
struct<>
-- !query 4 output
@ -49,7 +49,7 @@ struct<>
-- !query 5
INSERT INTO CASE_TBL VALUES (3, -30.3)
INSERT INTO CASE_TBL VALUES (4, NULL)
-- !query 5 schema
struct<>
-- !query 5 output
@ -57,7 +57,7 @@ struct<>
-- !query 6
INSERT INTO CASE_TBL VALUES (4, NULL)
INSERT INTO CASE2_TBL VALUES (1, -1)
-- !query 6 schema
struct<>
-- !query 6 output
@ -65,7 +65,7 @@ struct<>
-- !query 7
INSERT INTO CASE2_TBL VALUES (1, -1)
INSERT INTO CASE2_TBL VALUES (2, -2)
-- !query 7 schema
struct<>
-- !query 7 output
@ -73,7 +73,7 @@ struct<>
-- !query 8
INSERT INTO CASE2_TBL VALUES (2, -2)
INSERT INTO CASE2_TBL VALUES (3, -3)
-- !query 8 schema
struct<>
-- !query 8 output
@ -81,7 +81,7 @@ struct<>
-- !query 9
INSERT INTO CASE2_TBL VALUES (3, -3)
INSERT INTO CASE2_TBL VALUES (2, -4)
-- !query 9 schema
struct<>
-- !query 9 output
@ -89,7 +89,7 @@ struct<>
-- !query 10
INSERT INTO CASE2_TBL VALUES (2, -4)
INSERT INTO CASE2_TBL VALUES (1, NULL)
-- !query 10 schema
struct<>
-- !query 10 output
@ -97,7 +97,7 @@ struct<>
-- !query 11
INSERT INTO CASE2_TBL VALUES (1, NULL)
INSERT INTO CASE2_TBL VALUES (NULL, -6)
-- !query 11 schema
struct<>
-- !query 11 output
@ -105,148 +105,140 @@ struct<>
-- !query 12
INSERT INTO CASE2_TBL VALUES (NULL, -6)
-- !query 12 schema
struct<>
-- !query 12 output
-- !query 13
SELECT '3' AS `One`,
CASE
WHEN CAST(udf(1 < 2) AS boolean) THEN 3
END AS `Simple WHEN`
-- !query 13 schema
-- !query 12 schema
struct<One:string,Simple WHEN:int>
-- !query 13 output
-- !query 12 output
3 3
-- !query 14
-- !query 13
SELECT '<NULL>' AS `One`,
CASE
WHEN 1 > 2 THEN udf(3)
END AS `Simple default`
-- !query 14 schema
-- !query 13 schema
struct<One:string,Simple default:string>
-- !query 14 output
-- !query 13 output
<NULL> NULL
-- !query 15
-- !query 14
SELECT '3' AS `One`,
CASE
WHEN udf(1) < 2 THEN udf(3)
ELSE udf(4)
END AS `Simple ELSE`
-- !query 15 schema
-- !query 14 schema
struct<One:string,Simple ELSE:string>
-- !query 15 output
-- !query 14 output
3 3
-- !query 16
-- !query 15
SELECT udf('4') AS `One`,
CASE
WHEN 1 > 2 THEN 3
ELSE 4
END AS `ELSE default`
-- !query 16 schema
-- !query 15 schema
struct<One:string,ELSE default:int>
-- !query 16 output
-- !query 15 output
4 4
-- !query 17
-- !query 16
SELECT udf('6') AS `One`,
CASE
WHEN CAST(udf(1 > 2) AS boolean) THEN 3
WHEN udf(4) < 5 THEN 6
ELSE 7
END AS `Two WHEN with default`
-- !query 17 schema
-- !query 16 schema
struct<One:string,Two WHEN with default:int>
-- !query 17 output
-- !query 16 output
6 6
-- !query 18
-- !query 17
SELECT '7' AS `None`,
CASE WHEN rand() < udf(0) THEN 1
END AS `NULL on no matches`
-- !query 18 schema
-- !query 17 schema
struct<None:string,NULL on no matches:int>
-- !query 18 output
-- !query 17 output
7 NULL
-- !query 19
-- !query 18
SELECT CASE WHEN CAST(udf(1=0) AS boolean) THEN 1/0 WHEN 1=1 THEN 1 ELSE 2/0 END
-- !query 19 schema
-- !query 18 schema
struct<CASE WHEN CAST(udf((1 = 0)) AS BOOLEAN) THEN (CAST(1 AS DOUBLE) / CAST(0 AS DOUBLE)) WHEN (1 = 1) THEN CAST(1 AS DOUBLE) ELSE (CAST(2 AS DOUBLE) / CAST(0 AS DOUBLE)) END:double>
-- !query 18 output
1.0
-- !query 19
SELECT CASE 1 WHEN 0 THEN 1/udf(0) WHEN 1 THEN 1 ELSE 2/0 END
-- !query 19 schema
struct<CASE WHEN (1 = 0) THEN (CAST(1 AS DOUBLE) / CAST(CAST(udf(0) AS DOUBLE) AS DOUBLE)) WHEN (1 = 1) THEN CAST(1 AS DOUBLE) ELSE (CAST(2 AS DOUBLE) / CAST(0 AS DOUBLE)) END:double>
-- !query 19 output
1.0
-- !query 20
SELECT CASE 1 WHEN 0 THEN 1/udf(0) WHEN 1 THEN 1 ELSE 2/0 END
SELECT CASE WHEN i > 100 THEN udf(1/0) ELSE udf(0) END FROM case_tbl
-- !query 20 schema
struct<CASE WHEN (1 = 0) THEN (CAST(1 AS DOUBLE) / CAST(CAST(udf(0) AS DOUBLE) AS DOUBLE)) WHEN (1 = 1) THEN CAST(1 AS DOUBLE) ELSE (CAST(2 AS DOUBLE) / CAST(0 AS DOUBLE)) END:double>
struct<CASE WHEN (i > 100) THEN udf((cast(1 as double) / cast(0 as double))) ELSE udf(0) END:string>
-- !query 20 output
1.0
0
0
0
0
-- !query 21
SELECT CASE WHEN i > 100 THEN udf(1/0) ELSE udf(0) END FROM case_tbl
-- !query 21 schema
struct<CASE WHEN (i > 100) THEN udf((cast(1 as double) / cast(0 as double))) ELSE udf(0) END:string>
-- !query 21 output
0
0
0
0
-- !query 22
SELECT CASE 'a' WHEN 'a' THEN udf(1) ELSE udf(2) END
-- !query 22 schema
-- !query 21 schema
struct<CASE WHEN (a = a) THEN udf(1) ELSE udf(2) END:string>
-- !query 22 output
-- !query 21 output
1
-- !query 23
-- !query 22
SELECT '' AS `Five`,
CASE
WHEN i >= 3 THEN i
END AS `>= 3 or Null`
FROM CASE_TBL
-- !query 23 schema
-- !query 22 schema
struct<Five:string,>= 3 or Null:int>
-- !query 23 output
-- !query 22 output
3
4
NULL
NULL
-- !query 24
-- !query 23
SELECT '' AS `Five`,
CASE WHEN i >= 3 THEN (i + i)
ELSE i
END AS `Simplest Math`
FROM CASE_TBL
-- !query 24 schema
-- !query 23 schema
struct<Five:string,Simplest Math:int>
-- !query 24 output
-- !query 23 output
1
2
6
8
-- !query 25
-- !query 24
SELECT '' AS `Five`, i AS `Value`,
CASE WHEN (i < 0) THEN 'small'
WHEN (i = 0) THEN 'zero'
@ -255,16 +247,16 @@ SELECT '' AS `Five`, i AS `Value`,
ELSE 'big'
END AS `Category`
FROM CASE_TBL
-- !query 25 schema
-- !query 24 schema
struct<Five:string,Value:int,Category:string>
-- !query 25 output
-- !query 24 output
1 one
2 two
3 big
4 big
-- !query 26
-- !query 25
SELECT '' AS `Five`,
CASE WHEN ((i < 0) or (i < 0)) THEN 'small'
WHEN ((i = 0) or (i = 0)) THEN 'zero'
@ -273,37 +265,37 @@ SELECT '' AS `Five`,
ELSE 'big'
END AS `Category`
FROM CASE_TBL
-- !query 26 schema
-- !query 25 schema
struct<Five:string,Category:string>
-- !query 26 output
-- !query 25 output
big
big
one
two
-- !query 27
-- !query 26
SELECT * FROM CASE_TBL WHERE udf(COALESCE(f,i)) = 4
-- !query 27 schema
-- !query 26 schema
struct<i:int,f:double>
-- !query 27 output
-- !query 26 output
4 NULL
-- !query 28
-- !query 27
SELECT * FROM CASE_TBL WHERE udf(NULLIF(f,i)) = 2
-- !query 28 schema
-- !query 27 schema
struct<i:int,f:double>
-- !query 28 output
-- !query 27 output
-- !query 29
-- !query 28
SELECT udf(COALESCE(a.f, b.i, b.j))
FROM CASE_TBL a, CASE2_TBL b
-- !query 29 schema
-- !query 28 schema
struct<udf(coalesce(f, cast(i as double), cast(j as double))):string>
-- !query 29 output
-- !query 28 output
-30.3
-30.3
-30.3
@ -330,24 +322,24 @@ struct<udf(coalesce(f, cast(i as double), cast(j as double))):string>
3.0
-- !query 30
-- !query 29
SELECT *
FROM CASE_TBL a, CASE2_TBL b
WHERE udf(COALESCE(a.f, b.i, b.j)) = 2
-- !query 30 schema
-- !query 29 schema
struct<i:int,f:double,i:int,j:int>
-- !query 30 output
-- !query 29 output
4 NULL 2 -2
4 NULL 2 -4
-- !query 31
-- !query 30
SELECT udf('') AS Five, NULLIF(a.i,b.i) AS `NULLIF(a.i,b.i)`,
NULLIF(b.i, 4) AS `NULLIF(b.i,4)`
FROM CASE_TBL a, CASE2_TBL b
-- !query 31 schema
-- !query 30 schema
struct<Five:string,NULLIF(a.i,b.i):int,NULLIF(b.i,4):int>
-- !query 31 output
-- !query 30 output
1 2
1 2
1 3
@ -374,18 +366,18 @@ struct<Five:string,NULLIF(a.i,b.i):int,NULLIF(b.i,4):int>
NULL 3
-- !query 32
-- !query 31
SELECT '' AS `Two`, *
FROM CASE_TBL a, CASE2_TBL b
WHERE CAST(udf(COALESCE(f,b.i) = 2) AS boolean)
-- !query 32 schema
-- !query 31 schema
struct<Two:string,i:int,f:double,i:int,j:int>
-- !query 32 output
-- !query 31 output
4 NULL 2 -2
4 NULL 2 -4
-- !query 33
-- !query 32
SELECT CASE
(CASE vol('bar')
WHEN udf('foo') THEN 'it was foo!'
@ -395,31 +387,23 @@ SELECT CASE
WHEN udf('it was foo!') THEN 'foo recognized'
WHEN 'it was bar!' THEN udf('bar recognized')
ELSE 'unrecognized' END AS col
-- !query 33 schema
-- !query 32 schema
struct<col:string>
-- !query 33 output
-- !query 32 output
bar recognized
-- !query 34
-- !query 33
DROP TABLE CASE_TBL
-- !query 33 schema
struct<>
-- !query 33 output
-- !query 34
DROP TABLE CASE2_TBL
-- !query 34 schema
struct<>
-- !query 34 output
-- !query 35
DROP TABLE CASE2_TBL
-- !query 35 schema
struct<>
-- !query 35 output
-- !query 36
set spark.sql.crossJoin.enabled=false
-- !query 36 schema
struct<key:string,value:string>
-- !query 36 output
spark.sql.crossJoin.enabled false

View file

@ -151,17 +151,37 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext {
val resultFile: String
}
/**
* traits that indicate UDF or PgSQL to trigger the code path specific to each. For instance,
* PgSQL tests require to register some UDF functions.
*/
private trait PgSQLTest
private trait UDFTest {
val udf: TestUDF
}
/** A regular test case. */
private case class RegularTestCase(
name: String, inputFile: String, resultFile: String) extends TestCase
/** A PostgreSQL test case. */
private case class PgSQLTestCase(
name: String, inputFile: String, resultFile: String) extends TestCase
name: String, inputFile: String, resultFile: String) extends TestCase with PgSQLTest
/** A UDF test case. */
private case class UDFTestCase(
name: String, inputFile: String, resultFile: String, udf: TestUDF) extends TestCase
name: String,
inputFile: String,
resultFile: String,
udf: TestUDF) extends TestCase with UDFTest
/** A UDF PostgreSQL test case. */
private case class UDFPgSQLTestCase(
name: String,
inputFile: String,
resultFile: String,
udf: TestUDF) extends TestCase with UDFTest with PgSQLTest
private def createScalaTestCase(testCase: TestCase): Unit = {
if (blackList.exists(t =>
@ -169,12 +189,14 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext {
// Create a test case to ignore this case.
ignore(testCase.name) { /* Do nothing */ }
} else testCase match {
case UDFTestCase(_, _, _, udf: TestPythonUDF) if !shouldTestPythonUDFs =>
case udfTestCase: UDFTest
if udfTestCase.udf.isInstanceOf[TestPythonUDF] && !shouldTestPythonUDFs =>
ignore(s"${testCase.name} is skipped because " +
s"[$pythonExec] and/or pyspark were not available.") {
/* Do nothing */
}
case UDFTestCase(_, _, _, udf: TestScalarPandasUDF) if !shouldTestScalarPandasUDFs =>
case udfTestCase: UDFTest
if udfTestCase.udf.isInstanceOf[TestScalarPandasUDF] && !shouldTestScalarPandasUDFs =>
ignore(s"${testCase.name} is skipped because pyspark," +
s"pandas and/or pyarrow were not available in [$pythonExec].") {
/* Do nothing */
@ -254,12 +276,15 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext {
// This does not isolate catalog changes.
val localSparkSession = spark.newSession()
loadTestData(localSparkSession)
testCase match {
case udfTestCase: UDFTestCase =>
// vol used by udf-case.sql.
localSparkSession.udf.register("vol", (s: String) => s)
case udfTestCase: UDFTest =>
registerTestUDF(udfTestCase.udf, localSparkSession)
case _: PgSQLTestCase =>
case _ =>
}
testCase match {
case _: PgSQLTest =>
// booleq/boolne used by boolean.sql
localSparkSession.udf.register("booleq", (b1: Boolean, b2: Boolean) => b1 == b2)
localSparkSession.udf.register("boolne", (b1: Boolean, b2: Boolean) => b1 != b2)
@ -268,7 +293,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext {
// PostgreSQL enabled cartesian product by default.
localSparkSession.conf.set(SQLConf.CROSS_JOINS_ENABLED.key, true)
localSparkSession.conf.set(SQLConf.ANSI_SQL_PARSER.key, true)
case _ => // Don't add UDFs in Regular tests.
case _ =>
}
if (configSet.isDefined) {
@ -388,13 +413,16 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext {
val absPath = file.getAbsolutePath
val testCaseName = absPath.stripPrefix(inputFilePath).stripPrefix(File.separator)
if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}udf")) {
if (file.getAbsolutePath.startsWith(
s"$inputFilePath${File.separator}udf${File.separator}pgSQL")) {
Seq(TestScalaUDF("udf"), TestPythonUDF("udf"), TestScalarPandasUDF("udf")).map { udf =>
UDFPgSQLTestCase(
s"$testCaseName - ${udf.prettyName}", absPath, resultFile, udf)
}
} else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}udf")) {
Seq(TestScalaUDF("udf"), TestPythonUDF("udf"), TestScalarPandasUDF("udf")).map { udf =>
UDFTestCase(
s"$testCaseName - ${udf.prettyName}",
absPath,
resultFile,
udf)
s"$testCaseName - ${udf.prettyName}", absPath, resultFile, udf)
}
} else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}pgSQL")) {
PgSQLTestCase(testCaseName, absPath, resultFile) :: Nil