[SPARK-18856][SQL] non-empty partitioned table should not report zero size

## What changes were proposed in this pull request?

In `DataSource`, if the table is not analyzed, we will use 0 as the default value for table size. This is dangerous, we may broadcast a large table and cause OOM. We should use `defaultSizeInBytes` instead.

## How was this patch tested?

new regression test

Author: Wenchen Fan <wenchen@databricks.com>

Closes #16280 from cloud-fan/bug.
This commit is contained in:
Wenchen Fan 2016-12-14 21:03:56 -08:00 committed by Reynold Xin
parent 8db4d95c02
commit d6f11a12a1
2 changed files with 20 additions and 1 deletions

View file

@ -388,10 +388,11 @@ case class DataSource(
val fileCatalog = if (sparkSession.sqlContext.conf.manageFilesourcePartitions &&
catalogTable.isDefined && catalogTable.get.tracksPartitionsInCatalog) {
val defaultTableSize = sparkSession.sessionState.conf.defaultSizeInBytes
new CatalogFileIndex(
sparkSession,
catalogTable.get,
catalogTable.get.stats.map(_.sizeInBytes.toLong).getOrElse(0L))
catalogTable.get.stats.map(_.sizeInBytes.toLong).getOrElse(defaultTableSize))
} else {
new InMemoryFileIndex(sparkSession, globbedPaths, options, Some(partitionSchema))
}

View file

@ -26,6 +26,7 @@ import scala.util.Random
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.internal.StaticSQLConf
import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils}
import org.apache.spark.sql.test.SQLTestData.ArrayData
import org.apache.spark.sql.types._
@ -176,6 +177,7 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared
* when using the Hive external catalog) as well as in the sql/core module.
*/
abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils {
import testImplicits._
private val dec1 = new java.math.BigDecimal("1.000000000000000000")
private val dec2 = new java.math.BigDecimal("8.000000000000000000")
@ -242,4 +244,20 @@ abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils
}
}
}
// This test will be run twice: with and without Hive support
test("SPARK-18856: non-empty partitioned table should not report zero size") {
withTable("ds_tbl", "hive_tbl") {
spark.range(100).select($"id", $"id" % 5 as "p").write.partitionBy("p").saveAsTable("ds_tbl")
val stats = spark.table("ds_tbl").queryExecution.optimizedPlan.statistics
assert(stats.sizeInBytes > 0, "non-empty partitioned table should not report zero size.")
if (spark.conf.get(StaticSQLConf.CATALOG_IMPLEMENTATION) == "hive") {
sql("CREATE TABLE hive_tbl(i int) PARTITIONED BY (j int)")
sql("INSERT INTO hive_tbl PARTITION(j=1) SELECT 1")
val stats2 = spark.table("hive_tbl").queryExecution.optimizedPlan.statistics
assert(stats2.sizeInBytes > 0, "non-empty partitioned table should not report zero size.")
}
}
}
}