[SPARK-16901] Hive settings in hive-site.xml may be overridden by Hive's default values
## What changes were proposed in this pull request?

When we create the HiveConf for the metastore client, we use a Hadoop Conf as the base, which may contain Hive settings from hive-site.xml (https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala#L49). However, HiveConf's initialize function basically ignores the base Hadoop Conf and always uses its default values (i.e. settings with non-null default values) as the base (https://github.com/apache/hive/blob/release-1.2.1/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java#L2687). So, even if a user puts javax.jdo.option.ConnectionURL in hive-site.xml, it is not used and Hive will fall back to its default, which is jdbc:derby:;databaseName=metastore_db;create=true. This issue only shows up when `spark.sql.hive.metastore.jars` is not set to builtin.

## How was this patch tested?

New test in HiveSparkSubmitSuite.

Author: Yin Huai <yhuai@databricks.com>

Closes #14497 from yhuai/SPARK-16901.
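At its core, the fix swaps the HiveConf constructor that takes a base Configuration for the one that starts from Hive's defaults, then copies every Hadoop/Hive setting over explicitly so user values win over the defaults. A minimal sketch of that pattern (the `toMetastoreHiveConf` helper name is ours, not the patch's):

```scala
import scala.collection.JavaConverters._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.ql.session.SessionState

// Passing hadoopConf as the constructor's base does not work as expected:
// HiveConf.initialize re-applies every Hive setting with a non-null default
// on top of the base, silently dropping values such as
// javax.jdo.option.ConnectionURL that came from hive-site.xml.
//   val hiveConf = new HiveConf(hadoopConf, classOf[SessionState])  // loses settings
def toMetastoreHiveConf(hadoopConf: Configuration): HiveConf = {
  val hiveConf = new HiveConf(classOf[SessionState])
  // Copy entries after construction, so user settings override Hive's defaults.
  hadoopConf.iterator().asScala.foreach { entry =>
    hiveConf.set(entry.getKey, entry.getValue)
  }
  hiveConf
}
```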
parent 6cbde337a5
commit e679bc3c1c
@@ -141,14 +141,32 @@ private[hive] class HiveClientImpl(
       // so we should keep `conf` and reuse the existing instance of `CliSessionState`.
       originalState
     } else {
-      val hiveConf = new HiveConf(hadoopConf, classOf[SessionState])
+      val hiveConf = new HiveConf(classOf[SessionState])
+      // 1: we set all confs in the hadoopConf to this hiveConf.
+      // This hadoopConf contains user settings in Hadoop's core-site.xml file
+      // and Hive's hive-site.xml file. Note, we load hive-site.xml file manually in
+      // SharedState and put settings in this hadoopConf instead of relying on HiveConf
+      // to load user settings. Otherwise, HiveConf's initialize method will override
+      // settings in the hadoopConf. This issue only shows up when spark.sql.hive.metastore.jars
+      // is not set to builtin. When spark.sql.hive.metastore.jars is builtin, the classpath
+      // has hive-site.xml. So, HiveConf will use that to override its default values.
+      hadoopConf.iterator().asScala.foreach { entry =>
+        val key = entry.getKey
+        val value = entry.getValue
+        if (key.toLowerCase.contains("password")) {
+          logDebug(s"Applying Hadoop and Hive config to Hive Conf: $key=xxx")
+        } else {
+          logDebug(s"Applying Hadoop and Hive config to Hive Conf: $key=$value")
+        }
+        hiveConf.set(key, value)
+      }
       // HiveConf is a Hadoop Configuration, which has a field of classLoader and
       // the initial value will be the current thread's context class loader
       // (i.e. initClassLoader at here).
       // We call initialConf.setClassLoader(initClassLoader) at here to make
       // this action explicit.
       hiveConf.setClassLoader(initClassLoader)
-      // First, we set all spark confs to this hiveConf.
+      // 2: we set all spark confs to this hiveConf.
       sparkConf.getAll.foreach { case (k, v) =>
         if (k.toLowerCase.contains("password")) {
           logDebug(s"Applying Spark config to Hive Conf: $k=xxx")
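The numbered comments also pin down a strict precedence order: Hive's defaults, then hadoopConf (core-site.xml plus the hive-site.xml loaded by SharedState), then Spark confs, then extraConfig, with each later layer overriding the earlier ones. A toy model of that layering (illustrative names, not Spark API):

```scala
// Toy model of the numbered steps in the hunk: later layers win.
def effectiveConf(layers: Seq[Map[String, String]]): Map[String, String] =
  layers.foldLeft(Map.empty[String, String])(_ ++ _)

val conf = effectiveConf(Seq(
  // Hive's built-in defaults (what the bug left in place):
  Map("javax.jdo.option.ConnectionURL" ->
    "jdbc:derby:;databaseName=metastore_db;create=true"),
  // Step 1: hadoopConf, including the hive-site.xml loaded by SharedState.
  Map("javax.jdo.option.ConnectionURL" -> "jdbc:derby:memory:db;create=true"),
  // Step 2: sparkConf entries. Step 3: extraConfig entries.
  Map.empty, Map.empty))

assert(conf("javax.jdo.option.ConnectionURL").startsWith("jdbc:derby:memory:"))
```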
@@ -157,7 +175,7 @@ private[hive] class HiveClientImpl(
         }
         hiveConf.set(k, v)
       }
-      // Second, we set all entries in config to this hiveConf.
+      // 3: we set all entries in config to this hiveConf.
       extraConfig.foreach { case (k, v) =>
         if (k.toLowerCase.contains("password")) {
           logDebug(s"Applying extra config to HiveConf: $k=xxx")
@@ -253,6 +253,47 @@ class HiveSparkSubmitSuite
     runSparkSubmit(args)
   }
 
+  test("SPARK-16901: set javax.jdo.option.ConnectionURL") {
+    // In this test, we set javax.jdo.option.ConnectionURL and set the metastore version to
+    // 0.13. This test will make sure that javax.jdo.option.ConnectionURL will not be
+    // overridden by Hive's default settings when we create a HiveConf object inside
+    // HiveClientImpl. Please see SPARK-16901 for more details.
+
+    val metastoreLocation = Utils.createTempDir()
+    metastoreLocation.delete()
+    val metastoreURL =
+      s"jdbc:derby:memory:;databaseName=${metastoreLocation.getAbsolutePath};create=true"
+    val hiveSiteXmlContent =
+      s"""
+         |<configuration>
+         |  <property>
+         |    <name>javax.jdo.option.ConnectionURL</name>
+         |    <value>$metastoreURL</value>
+         |  </property>
+         |</configuration>
+       """.stripMargin
+
+    // Write a hive-site.xml containing the metastore connection URL setting.
+    val hiveSiteDir = Utils.createTempDir()
+    val file = new File(hiveSiteDir.getCanonicalPath, "hive-site.xml")
+    val bw = new BufferedWriter(new FileWriter(file))
+    bw.write(hiveSiteXmlContent)
+    bw.close()
+
+    val unusedJar = TestUtils.createJarWithClasses(Seq.empty)
+    val args = Seq(
+      "--class", SetMetastoreURLTest.getClass.getName.stripSuffix("$"),
+      "--name", "SetMetastoreURLTest",
+      "--master", "local[1]",
+      "--conf", "spark.ui.enabled=false",
+      "--conf", "spark.master.rest.enabled=false",
+      "--conf", s"spark.sql.test.expectedMetastoreURL=$metastoreURL",
+      "--conf", s"spark.driver.extraClassPath=${hiveSiteDir.getCanonicalPath}",
+      "--driver-java-options", "-Dderby.system.durability=test",
+      unusedJar.toString)
+    runSparkSubmit(args)
+  }
+
   // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly.
   // This is copied from org.apache.spark.deploy.SparkSubmitSuite
   private def runSparkSubmit(args: Seq[String]): Unit = {
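The test leans on Hive's classpath-based discovery of hive-site.xml: the file is written into a temp directory that is then prepended via spark.driver.extraClassPath, so the manually loaded hive-site.xml is visible to SharedState in the driver. A hedged way to sanity-check that the file is actually reachable from inside the driver (illustrative, not part of the test):

```scala
// Illustrative check only: hive-site.xml is picked up only if its directory
// is on the classpath, which is what spark.driver.extraClassPath arranges.
val hiveSiteUrl =
  Thread.currentThread().getContextClassLoader.getResource("hive-site.xml")
require(hiveSiteUrl != null, "hive-site.xml is not visible on the classpath")
```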
@@ -313,6 +354,45 @@ class HiveSparkSubmitSuite
     }
   }
 
+object SetMetastoreURLTest extends Logging {
+  def main(args: Array[String]): Unit = {
+    Utils.configTestLog4j("INFO")
+
+    val sparkConf = new SparkConf(loadDefaults = true)
+    val builder = SparkSession.builder()
+      .config(sparkConf)
+      .config("spark.ui.enabled", "false")
+      .config("spark.sql.hive.metastore.version", "0.13.1")
+      // The issue described in SPARK-16901 only appears when
+      // spark.sql.hive.metastore.jars is not set to builtin.
+      .config("spark.sql.hive.metastore.jars", "maven")
+      .enableHiveSupport()
+
+    val spark = builder.getOrCreate()
+    val expectedMetastoreURL =
+      spark.conf.get("spark.sql.test.expectedMetastoreURL")
+    logInfo(s"spark.sql.test.expectedMetastoreURL is $expectedMetastoreURL")
+
+    if (expectedMetastoreURL == null) {
+      throw new Exception(
+        s"spark.sql.test.expectedMetastoreURL should be set.")
+    }
+
+    // HiveSharedState is used when Hive support is enabled.
+    val actualMetastoreURL =
+      spark.sharedState.asInstanceOf[HiveSharedState]
+        .metadataHive
+        .getConf("javax.jdo.option.ConnectionURL", "this_is_a_wrong_URL")
+    logInfo(s"javax.jdo.option.ConnectionURL is $actualMetastoreURL")
+
+    if (actualMetastoreURL != expectedMetastoreURL) {
+      throw new Exception(
+        s"Expected value of javax.jdo.option.ConnectionURL is $expectedMetastoreURL. But, " +
+          s"the actual value is $actualMetastoreURL")
+    }
+  }
+}
+
 object SetWarehouseLocationTest extends Logging {
   def main(args: Array[String]): Unit = {
     Utils.configTestLog4j("INFO")