[SPARK-31291][SQL][TEST] SQLQueryTestSuite: Sharing test data and test tables among multiple test cases
### What changes were proposed in this pull request? `SQLQueryTestSuite` takes 35 minutes to run. I've listed the 10 test cases that took the longest time in the `SQL` module below. Class | Spend time ↑ | Failure | Skip | Pass | Total test case -- | -- | -- | -- | -- | -- SQLQueryTestSuite | 35 minutes | 0 | 1 | 230 | 231 TPCDSQuerySuite | 3 minutes 8 seconds | 0 | 0 | 156 | 156 SQLQuerySuite | 2 minutes 52 seconds | 0 | 0 | 185 | 185 DynamicPartitionPruningSuiteAEOff | 1 minutes 52 seconds | 0 | 0 | 22 | 22 DataFrameFunctionsSuite | 1 minutes 37 seconds | 0 | 0 | 102 | 102 DynamicPartitionPruningSuiteAEOn | 1 minutes 24 seconds | 0 | 0 | 22 | 22 DataFrameSuite | 1 minutes 14 seconds | 0 | 2 | 157 | 159 SubquerySuite | 1 minutes 12 seconds | 0 | 1 | 70 | 71 SingleLevelAggregateHashMapSuite | 1 minutes 1 seconds | 0 | 0 | 50 | 50 DataFrameAggregateSuite | 59 seconds | 0 | 0 | 50 | 50 I checked the code of `SQLQueryTestSuite` and found that `SQLQueryTestSuite` loads test data repeatedly. This PR will improve the performance of `SQLQueryTestSuite`. The total time to run `SQLQueryTestSuite` before and after this PR is shown below. Before No | Time -- | -- 1 | 20 minutes, 22 seconds 2 | 23 minutes, 21 seconds 3 | 21 minutes, 19 seconds 4 | 22 minutes, 26 seconds 5 | 20 minutes, 8 seconds After No | Time -- | -- 1 | 20 minutes, 52 seconds 2 | 20 minutes, 47 seconds 3 | 20 minutes, 7 seconds 4 | 21 minutes, 10 seconds 5 | 20 minutes, 4 seconds ### Why are the changes needed? Improve the performance of `SQLQueryTestSuite`. ### Does this PR introduce any user-facing change? 'No'. ### How was this patch tested? Jenkins test Closes #28060 from beliefer/avoid-load-test-data-repeatedly. Lead-authored-by: gengjiaan <gengjiaan@360.cn> Co-authored-by: beliefer <beliefer@163.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
parent
8b4862953a
commit
014d33570b
|
@ -88,7 +88,7 @@ SELECT * FROM testdata LIMIT key > 3
|
|||
struct<>
|
||||
-- !query output
|
||||
org.apache.spark.sql.AnalysisException
|
||||
The limit expression must evaluate to a constant value, but got (testdata.`key` > 3);
|
||||
The limit expression must evaluate to a constant value, but got (spark_catalog.default.testdata.`key` > 3);
|
||||
|
||||
|
||||
-- !query
|
||||
|
|
|
@ -63,15 +63,9 @@ SHOW TABLES
|
|||
-- !query schema
|
||||
struct<database:string,tableName:string,isTemporary:boolean>
|
||||
-- !query output
|
||||
aggtest
|
||||
arraydata
|
||||
mapdata
|
||||
onek
|
||||
show_t1
|
||||
show_t2
|
||||
show_t3
|
||||
tenk1
|
||||
testdata
|
||||
|
||||
|
||||
-- !query
|
||||
|
@ -79,15 +73,9 @@ SHOW TABLES IN showdb
|
|||
-- !query schema
|
||||
struct<database:string,tableName:string,isTemporary:boolean>
|
||||
-- !query output
|
||||
aggtest
|
||||
arraydata
|
||||
mapdata
|
||||
onek
|
||||
show_t1
|
||||
show_t2
|
||||
show_t3
|
||||
tenk1
|
||||
testdata
|
||||
|
||||
|
||||
-- !query
|
||||
|
|
|
@ -63,12 +63,6 @@ SHOW VIEWS
|
|||
-- !query schema
|
||||
struct<namespace:string,viewName:string,isTemporary:boolean>
|
||||
-- !query output
|
||||
aggtest
|
||||
arraydata
|
||||
mapdata
|
||||
onek
|
||||
tenk1
|
||||
testdata
|
||||
view_1
|
||||
view_2
|
||||
view_4
|
||||
|
@ -79,12 +73,6 @@ SHOW VIEWS FROM showdb
|
|||
-- !query schema
|
||||
struct<namespace:string,viewName:string,isTemporary:boolean>
|
||||
-- !query output
|
||||
aggtest
|
||||
arraydata
|
||||
mapdata
|
||||
onek
|
||||
tenk1
|
||||
testdata
|
||||
view_1
|
||||
view_2
|
||||
view_4
|
||||
|
@ -95,12 +83,6 @@ SHOW VIEWS IN showdb
|
|||
-- !query schema
|
||||
struct<namespace:string,viewName:string,isTemporary:boolean>
|
||||
-- !query output
|
||||
aggtest
|
||||
arraydata
|
||||
mapdata
|
||||
onek
|
||||
tenk1
|
||||
testdata
|
||||
view_1
|
||||
view_2
|
||||
view_4
|
||||
|
@ -111,12 +93,6 @@ SHOW VIEWS IN global_temp
|
|||
-- !query schema
|
||||
struct<namespace:string,viewName:string,isTemporary:boolean>
|
||||
-- !query output
|
||||
aggtest
|
||||
arraydata
|
||||
mapdata
|
||||
onek
|
||||
tenk1
|
||||
testdata
|
||||
view_3
|
||||
view_4
|
||||
|
||||
|
|
|
@ -363,7 +363,6 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession {
|
|||
// Create a local SparkSession to have stronger isolation between different test cases.
|
||||
// This does not isolate catalog changes.
|
||||
val localSparkSession = spark.newSession()
|
||||
loadTestData(localSparkSession)
|
||||
|
||||
testCase match {
|
||||
case udfTestCase: UDFTest =>
|
||||
|
@ -572,14 +571,20 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession {
|
|||
}
|
||||
|
||||
/** Load built-in test tables into the SparkSession. */
|
||||
private def loadTestData(session: SparkSession): Unit = {
|
||||
private def createTestTables(session: SparkSession): Unit = {
|
||||
import session.implicits._
|
||||
|
||||
(1 to 100).map(i => (i, i.toString)).toDF("key", "value").createOrReplaceTempView("testdata")
|
||||
(1 to 100).map(i => (i, i.toString)).toDF("key", "value")
|
||||
.repartition(1)
|
||||
.write
|
||||
.format("parquet")
|
||||
.saveAsTable("testdata")
|
||||
|
||||
((Seq(1, 2, 3), Seq(Seq(1, 2, 3))) :: (Seq(2, 3, 4), Seq(Seq(2, 3, 4))) :: Nil)
|
||||
.toDF("arraycol", "nestedarraycol")
|
||||
.createOrReplaceTempView("arraydata")
|
||||
.write
|
||||
.format("parquet")
|
||||
.saveAsTable("arraydata")
|
||||
|
||||
(Tuple1(Map(1 -> "a1", 2 -> "b1", 3 -> "c1", 4 -> "d1", 5 -> "e1")) ::
|
||||
Tuple1(Map(1 -> "a2", 2 -> "b2", 3 -> "c2", 4 -> "d2")) ::
|
||||
|
@ -587,7 +592,9 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession {
|
|||
Tuple1(Map(1 -> "a4", 2 -> "b4")) ::
|
||||
Tuple1(Map(1 -> "a5")) :: Nil)
|
||||
.toDF("mapcol")
|
||||
.createOrReplaceTempView("mapdata")
|
||||
.write
|
||||
.format("parquet")
|
||||
.saveAsTable("mapdata")
|
||||
|
||||
session
|
||||
.read
|
||||
|
@ -595,7 +602,9 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession {
|
|||
.options(Map("delimiter" -> "\t", "header" -> "false"))
|
||||
.schema("a int, b float")
|
||||
.load(testFile("test-data/postgresql/agg.data"))
|
||||
.createOrReplaceTempView("aggtest")
|
||||
.write
|
||||
.format("parquet")
|
||||
.saveAsTable("aggtest")
|
||||
|
||||
session
|
||||
.read
|
||||
|
@ -621,7 +630,9 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession {
|
|||
|string4 string
|
||||
""".stripMargin)
|
||||
.load(testFile("test-data/postgresql/onek.data"))
|
||||
.createOrReplaceTempView("onek")
|
||||
.write
|
||||
.format("parquet")
|
||||
.saveAsTable("onek")
|
||||
|
||||
session
|
||||
.read
|
||||
|
@ -647,7 +658,18 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession {
|
|||
|string4 string
|
||||
""".stripMargin)
|
||||
.load(testFile("test-data/postgresql/tenk.data"))
|
||||
.createOrReplaceTempView("tenk1")
|
||||
.write
|
||||
.format("parquet")
|
||||
.saveAsTable("tenk1")
|
||||
}
|
||||
|
||||
/**
 * Drops the shared built-in test tables so the catalog is left clean
 * after the suite finishes. Each DROP uses IF EXISTS, so it is safe to
 * call even when a table was never created.
 */
private def removeTestTables(session: SparkSession): Unit = {
  // Same tables, in the same order, as the original one-statement-per-line version.
  val sharedTestTables = Seq("testdata", "arraydata", "mapdata", "aggtest", "onek", "tenk1")
  sharedTestTables.foreach { tableName =>
    session.sql(s"DROP TABLE IF EXISTS $tableName")
  }
}
|
||||
|
||||
private val originalTimeZone = TimeZone.getDefault
|
||||
|
@ -655,6 +677,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession {
|
|||
|
||||
override def beforeAll(): Unit = {
|
||||
super.beforeAll()
|
||||
createTestTables(spark)
|
||||
// Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*)
|
||||
TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"))
|
||||
// Add Locale setting
|
||||
|
@ -668,6 +691,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession {
|
|||
try {
|
||||
TimeZone.setDefault(originalTimeZone)
|
||||
Locale.setDefault(originalLocale)
|
||||
removeTestTables(spark)
|
||||
|
||||
// For debugging dump some statistics about how much time was spent in various optimizer rules
|
||||
logWarning(RuleExecutor.dumpTimeSpent())
|
||||
|
|
Loading…
Reference in a new issue